This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b9a0d9889b TIKA-4689 - streamline embedded file naming (#2692)
b9a0d9889b is described below
commit b9a0d9889b999b496680fdca03db245fe8b62b73
Author: Tim Allison <[email protected]>
AuthorDate: Thu Mar 12 12:32:52 2026 -0400
TIKA-4689 - streamline embedded file naming (#2692)
---
.../tika/extractor/EmbeddedDocumentUtil.java | 83 ++++++++++++++++++++++
.../java/org/apache/tika/io/FilenameUtils.java | 12 +---
.../apache/tika/metadata/TikaCoreProperties.java | 8 +++
.../org/apache/tika/parser/AutoDetectParser.java | 6 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 7 +-
.../tika/parser/sqlite3/SQLite3ParserTest.java | 6 +-
.../apache/tika/parser/jdbc/JDBCTableReader.java | 1 +
.../microsoft/MSEmbeddedStreamTranslator.java | 1 +
.../parser/microsoft/AbstractPOIFSExtractor.java | 3 +
.../tika/parser/microsoft/HSLFExtractor.java | 40 +++++++++--
.../apache/tika/parser/microsoft/TNEFParser.java | 22 ++++--
.../tika/parser/microsoft/WordExtractor.java | 9 ++-
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 8 ++-
.../parser/microsoft/rtf/RTFObjDataParser.java | 7 +-
.../microsoft/POIContainerExtractionTest.java | 8 +--
.../parser/microsoft/PowerPointParserTest.java | 9 +--
.../tika/parser/microsoft/WordParserTest.java | 6 +-
.../tika/parser/pdf/image/ImageGraphicsEngine.java | 4 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 6 +-
.../microsoft/POIContainerExtractionTest.java | 64 ++++++++---------
.../tika/parser/microsoft/rtf/RTFParserTest.java | 18 ++---
.../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +-
.../core/extractor/StandardUnpackSelector.java | 13 +---
.../tika/pipes/core/server/ParseHandler.java | 4 +-
24 files changed, 247 insertions(+), 102 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 741a14f58a..324840ce84 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -177,6 +177,89 @@ public class EmbeddedDocumentUtil implements Serializable {
return ".bin";
}
+ /**
+ * Looks up the file extension for a given media type string.
+ *
+ * @param mediaType the media type string (e.g., "image/png")
+ * @return the extension including the dot (e.g., ".png"), or empty string if unknown
+ */
+ /**
+ * Normalizes internal OCR routing media types (e.g., {@code image/ocr-png})
+ * back to standard media types (e.g., {@code image/png}).
+ * Returns the input unchanged if it is not an OCR routing type.
+ *
+ * @param mediaType the media type string
+ * @return the normalized media type string, or the original if no normalization needed
+ */
+ public static String normalizeMediaType(String mediaType) {
+ if (mediaType != null && mediaType.startsWith("image/ocr-")) {
+ return "image/" + mediaType.substring("image/ocr-".length());
+ }
+ return mediaType;
+ }
+
+ public static String getExtensionForMediaType(String mediaType) {
+ if (mediaType == null) {
+ return "";
+ }
+ mediaType = normalizeMediaType(mediaType);
+ try {
+ MimeType mimeType = MimeTypes.getDefaultMimeTypes().forName(mediaType);
+ return mimeType.getExtension();
+ } catch (MimeTypeException e) {
+ return "";
+ }
+ }
+
+ /**
+ * Type of embedded resource, used for generating canonical resource names.
+ */
+ public enum EmbeddedResourcePrefix {
+ EMBEDDED("embedded"),
+ IMAGE("image"),
+ THUMBNAIL("thumbnail");
+
+ private final String prefix;
+
+ EmbeddedResourcePrefix(String prefix) {
+ this.prefix = prefix;
+ }
+
+ public String getPrefix() {
+ return prefix;
+ }
+ }
+
+ /**
+ * Generates a canonical resource name from a type, counter, and media type.
+ * For example: {@code generateResourceName(EmbeddedResourcePrefix.EMBEDDED, 0, "image/png")}
+ * returns {@code "embedded-0.png"}.
+ *
+ * @param type the embedded resource type
+ * @param count the counter value
+ * @param mediaType the media type string, or null if unknown
+ * @return the generated resource name with extension
+ */
+ public static String generateResourceName(EmbeddedResourcePrefix type, int count,
+ String mediaType) {
+ return type.getPrefix() + "-" + count + getExtensionForMediaType(mediaType);
+ }
+
+ /**
+ * Sets a generated resource name on the metadata and marks the extension as inferred.
+ *
+ * @param metadata the metadata to update
+ * @param type the embedded resource type
+ * @param count the counter value
+ * @param mediaType the media type string, or null if unknown
+ */
+ public static void setGeneratedResourceName(Metadata metadata, EmbeddedResourcePrefix type,
+ int count, String mediaType) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ generateResourceName(type, count, mediaType));
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+ }
+
public static void recordException(Throwable t, Metadata m) {
String ex = ExceptionUtils.getFilteredStackTrace(t);
m.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ex);
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 2d3a7169ed..777f8482fb 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -21,6 +21,7 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MimeTypeException;
@@ -345,19 +346,12 @@ public class FilenameUtils {
if (mime == null) {
return defaultValue;
}
+ // Normalize OCR routing types (e.g., image/ocr-png -> image/png)
+ mime = EmbeddedDocumentUtil.normalizeMediaType(mime);
String ext = lookupExtension(mime);
if (ext != null) {
return ext;
}
- // Handle OCR media types (e.g., image/ocr-jpeg -> image/jpeg)
- // These are internal routing types that don't have registered extensions
- if (mime.startsWith("image/ocr-")) {
- String normalized = "image/" + mime.substring("image/ocr-".length());
- ext = lookupExtension(normalized);
- if (ext != null) {
- return ext;
- }
- }
return ".bin";
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index c6aad5e453..6d513a2a67 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -216,6 +216,14 @@ public interface TikaCoreProperties {
"detected_language_confidence_raw");
Property RESOURCE_NAME_KEY = Property.internalText(TIKA_META_PREFIX +
"resourceName");
+
+ /**
+ * Indicates that the file extension on the resource name was inferred by Tika
+ * (e.g., from content type detection) rather than provided by the original document.
+ */
+ Property RESOURCE_NAME_EXTENSION_INFERRED =
+ Property.externalBoolean(TIKA_META_PREFIX + "resourceNameExtensionInferred");
+
Property EMBEDDED_RELATIONSHIP_ID = Property.internalText(TIKA_META_PREFIX
+ "embeddedRelationshipId");
String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 623b027ece..6867c622d2 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -28,6 +28,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.extractor.StandardExtractorFactory;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -155,7 +156,10 @@ public class AutoDetectParser extends CompositeParser {
// Automatically detect the MIME type of the document
MediaType type = detector.detect(tis, metadata, context);
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ // Normalize OCR routing types (e.g., image/ocr-png -> image/png) so they
+ // don't leak into CONTENT_TYPE
+ metadata.set(Metadata.CONTENT_TYPE,
+ EmbeddedDocumentUtil.normalizeMediaType(type.toString()));
//check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
if (autoDetectParserConfig.getThrowOnZeroBytes()) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 22faaf86e4..dac2bf5fda 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -29,6 +29,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
@@ -192,7 +193,11 @@ public class RecursiveParserWrapper extends ParserDecorator {
} else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
objectName = "version-number-" +
metadata.get(TikaCoreProperties.VERSION_NUMBER);
} else {
- objectName = "embedded-" + counter.incrementAndGet();
+ objectName = EmbeddedDocumentUtil.generateResourceName(
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED,
+ counter.incrementAndGet(),
+ metadata.get(Metadata.CONTENT_TYPE));
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
//make sure that there isn't any path info in the objectName
//some parsers can return paths, not just file names
diff --git a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
index 5519ddbcc2..e5eb24b9cd 100644
--- a/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
+++ b/tika-parsers/tika-parsers-extended/tika-parsers-extended-integration-tests/src/test/java/org/apache/tika/parser/sqlite3/SQLite3ParserTest.java
@@ -90,7 +90,7 @@ public class SQLite3ParserTest extends TikaTest {
//timestamp test
assertContains("2015-01-03 15:17:03", x);
//first embedded doc's image tag
- assertContains("alt=\"image1.png\"", x);
+ assertContains("alt=\"image-1.png\"", x);
//second embedded doc's image tag
assertContains("alt=\"A description...\"", x);
//second table name
@@ -123,7 +123,7 @@ public class SQLite3ParserTest extends TikaTest {
xml);
//but no other content
assertNotContained("dog", xml);
- assertNotContained("alt=\"image1.png\"", xml);
+ assertNotContained("alt=\"image-1.png\"", xml);
//second embedded doc's image tag
assertNotContained("alt=\"A description...\"", xml);
}
@@ -158,7 +158,7 @@ public class SQLite3ParserTest extends TikaTest {
metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT));
//confirm .doc was added to blob
- assertEquals("/BYTES_COL_0.doc/image1.png",
+ assertEquals("/BYTES_COL_0.doc/image-1.png",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
index 1038e8ed5d..346262510d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java
@@ -234,6 +234,7 @@ public class JDBCTableReader {
//just in case something screwy is going on with the
column name
FilenameUtils.normalize(
FilenameUtils.getName(columnName + "_" + rowNum +
extension)));
+ m.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
if (embeddedDocumentUtil.shouldParseEmbedded(m)) {
embeddedDocumentUtil.parseEmbedded(tis, handler, m, true);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index fb9704d367..fd4c47a99f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -77,6 +77,7 @@ public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator {
}
} else {
name += '.' + type.getExtension();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
os.write(data);
os.flush();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 23d4445606..09a343b914 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -214,6 +214,7 @@ abstract class AbstractPOIFSExtractor {
} else {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
name + '.' + type.getExtension());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
}
parseEmbedded(dir, xhtml, metadata, outputHtml);
@@ -225,6 +226,7 @@ abstract class AbstractPOIFSExtractor {
} else {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
rName + '.' + type.getExtension());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
}
parseEmbedded(dir, xhtml, metadata, outputHtml);
@@ -319,6 +321,7 @@ abstract class AbstractPOIFSExtractor {
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName +
extension);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
parseEmbedded(parentDir, tis, xhtml, metadata, outputHtml);
} finally {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index aa8c8a9682..e2f10977bc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -60,6 +60,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -220,8 +221,12 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
continue;
}
String filename = d.getFileName();
+ boolean inferredExtension = false;
if (StringUtils.isBlank(filename)) {
- filename = "UNKNOWN-" + i;
+ filename = EmbeddedDocumentUtil.generateResourceName(
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED,
+ i, getDetectedMediaType(d));
+ inferredExtension = true;
}
try (TikaInputStream tis =
TikaInputStream.get(d.getInputStream())) {
if (FileMagic.valueOf(tis) == FileMagic.OLE2) {
@@ -231,7 +236,13 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
if (pfs.getRoot().getEntryNames().size() < 1) {
return;
}
- handleEmbeddedOfficeDoc(pfs.getRoot(), filename, xhtml, outputHtml);
+ Metadata metadata = Metadata.newInstance(context);
+ if (inferredExtension) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED,
+ true);
+ }
+ handleEmbeddedOfficeDoc(pfs.getRoot(), metadata, filename,
+ xhtml, outputHtml);
}
} else {
boolean shouldProcess = false;
@@ -243,7 +254,13 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
tis.reset();
}
if (shouldProcess) {
- handleEmbeddedResource(tis, filename, null, null, xhtml, true);
+ Metadata metadata = Metadata.newInstance(context);
+ if (inferredExtension) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED,
+ true);
+ }
+ handleEmbeddedResource(tis, metadata, filename, null,
+ null, null, xhtml, true);
}
}
} catch (IOException | TikaException e) {
@@ -563,7 +580,13 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
continue;
}
try (TikaInputStream picIs = TikaInputStream.get(data)) {
- handleEmbeddedResource(picIs, null, null, mediaType, xhtml, false);
+ String picName = EmbeddedDocumentUtil.generateResourceName(
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.IMAGE,
+ pic.getIndex(), mediaType);
+ Metadata picMetadata = Metadata.newInstance(context);
+ picMetadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+ handleEmbeddedResource(picIs, picMetadata, picName, null,
+ null, mediaType, xhtml, false);
}
}
}
@@ -656,4 +679,13 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
}
}
+ private String getDetectedMediaType(HSLFObjectData objectData) {
+ try (TikaInputStream tis = TikaInputStream.get(objectData.getInputStream())) {
+ Metadata m = Metadata.newInstance(context);
+ MediaType mt = getDetector().detect(tis, m, context);
+ return mt.toString();
+ } catch (Exception e) {
+ return null;
+ }
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
index 46d7310274..6b9ac506c6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
@@ -84,28 +84,35 @@ public class TNEFParser implements Parser {
MAPIAttribute attr =
msg.getMessageMAPIAttribute(MAPIProperty.RTF_COMPRESSED);
if (attr != null && attr instanceof MAPIRtfAttribute) {
MAPIRtfAttribute rtf = (MAPIRtfAttribute) attr;
- handleEmbedded("message.rtf", "application/rtf", rtf.getData(), embeddedExtractor,
- xhtml, context);
+ handleEmbedded("message.rtf", "application/rtf", false,
+ rtf.getData(), embeddedExtractor, xhtml, context);
}
// Recurse into each attachment in turn
+ int unknownCount = 0;
for (Attachment attachment : msg.getAttachments()) {
String name = attachment.getLongFilename();
+ boolean inferredExtension = false;
if (name == null || name.isEmpty()) {
name = attachment.getFilename();
}
if (name == null || name.isEmpty()) {
String ext = attachment.getExtension();
- if (ext != null) {
- name = "unknown" + ext;
+ if (ext == null) {
+ ext = "";
}
+ name = EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+ + "-" + unknownCount++ + ext;
+ inferredExtension = true;
}
- handleEmbedded(name, null, attachment.getContents(), embeddedExtractor, xhtml, context);
+ handleEmbedded(name, null, inferredExtension,
+ attachment.getContents(), embeddedExtractor, xhtml, context);
}
xhtml.endDocument();
}
- private void handleEmbedded(String name, String type, byte[] contents,
+ private void handleEmbedded(String name, String type, boolean inferredExtension,
+ byte[] contents,
EmbeddedDocumentExtractor embeddedExtractor,
ContentHandler handler,
ParseContext context)
throws IOException, SAXException, TikaException {
@@ -116,6 +123,9 @@ public class TNEFParser implements Parser {
if (type != null) {
metadata.set(Metadata.CONTENT_TYPE, type);
}
+ if (inferredExtension) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+ }
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
try (TikaInputStream tis = TikaInputStream.get(contents)) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index ad61955e51..a08eaa0fba 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -60,6 +60,7 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -599,7 +600,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
// Make up a name for the picture
// There isn't one in the file, but we need to be able to reference
// the picture from the img tag and the embedded resource
- String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : "");
+ String filename = EmbeddedDocumentUtil.EmbeddedResourcePrefix.IMAGE.getPrefix()
+ + "-" + pictureNumber + (extension.length() > 0 ? "." + extension : "");
// Grab the mime type for the picture
String mimeType = picture.getMimeType();
@@ -615,7 +617,10 @@ public class WordExtractor extends AbstractPOIFSExtractor {
// (Only expose each individual image once)
if (!pictures.hasOutput(picture)) {
TikaInputStream tis = TikaInputStream.get(picture.getContent());
- handleEmbeddedResource(tis, filename, null, mimeType, xhtml, false);
+ Metadata picMetadata = Metadata.newInstance(context);
+ picMetadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
+ handleEmbeddedResource(tis, picMetadata, filename, null,
+ null, mimeType, xhtml, false);
pictures.recordOutput(picture);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
index 66495f4656..5b52f19acb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
@@ -224,12 +224,16 @@ class RTFEmbObjHandler {
String extension = embeddedDocumentUtil.getExtension(tis,
metadata);
if (inObject && state == EMB_STATE.PICT) {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- "thumbnail_" + thumbCount++ + extension);
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix()
+ + "-" + thumbCount++ + extension);
metadata.set(RTFMetadata.THUMBNAIL, "true");
} else {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- "file_" + unknownFilenameCount.getAndIncrement() + extension);
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+ + "-" + unknownFilenameCount.getAndIncrement()
+ + extension);
}
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
try {
embeddedDocumentUtil
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
index 9b7e2e9ba1..ff4c12061e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
@@ -198,9 +198,10 @@ class RTFObjDataParser {
memoryLimitInKb * 1024);
}
ret = out.toByteArray();
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- "file_" + unknownFilenameCount.getAndIncrement() + "." +
- type.getExtension());
+ EmbeddedDocumentUtil.setGeneratedResourceName(metadata,
+ EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED,
+ unknownFilenameCount.getAndIncrement(),
+ type.getType().toString());
metadata.set(Metadata.CONTENT_TYPE,
type.getType().toString());
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 883899fa14..7a661ccf0e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -93,7 +93,7 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe
assertEquals(1, handler.filenames.size());
assertEquals(1, handler.mediaTypes.size());
- assertEquals("image1.png", handler.filenames.get(0));
+ assertEquals("image-1.png", handler.filenames.get(0));
assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
@@ -102,9 +102,9 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe
assertEquals(3, handler.filenames.size());
assertEquals(3, handler.mediaTypes.size());
- assertEquals("image1.png", handler.filenames.get(0));
- assertEquals("image2.jpg", handler.filenames.get(1));
- assertEquals("image3.png", handler.filenames.get(2));
+ assertEquals("image-1.png", handler.filenames.get(0));
+ assertEquals("image-2.jpg", handler.filenames.get(1));
+ assertEquals("image-3.png", handler.filenames.get(2));
assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 60d3012d87..15a6dc2a08 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.List;
@@ -349,11 +350,11 @@ public class PowerPointParserTest extends TikaTest {
assertContains("tika", content);
assertContains("MyTitle", content);
- assertEquals("/embedded-1",
- metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ String path1 = metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ assertTrue(path1.contains("."), "embedded resource should have extension: " + path1);
- assertEquals("/embedded-2",
- metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ String path2 = metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ assertTrue(path2.contains("."), "embedded resource should have extension: " + path2);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 2e15eb0639..9710036fde 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -113,9 +113,9 @@ public class WordParserTest extends TikaTest {
xml = getXML("testWORD_3imgs.doc").xml;
// Images 1-3
- assertTrue(xml.contains("src=\"embedded:image1.png\""), "Image not found in:\n" + xml);
- assertTrue(xml.contains("src=\"embedded:image2.jpg\""), "Image not found in:\n" + xml);
- assertTrue(xml.contains("src=\"embedded:image3.png\""), "Image not found in:\n" + xml);
+ assertTrue(xml.contains("src=\"embedded:image-1.png\""), "Image not found in:\n" + xml);
+ assertTrue(xml.contains("src=\"embedded:image-2.jpg\""), "Image not found in:\n" + xml);
+ assertTrue(xml.contains("src=\"embedded:image-3.png\""), "Image not found in:\n" + xml);
// Text too
assertTrue(xml.contains("<p>The end!"));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index e34de47f48..72f95d8a1d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -387,7 +387,8 @@ public class ImageGraphicsEngine extends
PDFGraphicsStreamEngine {
//this is the metadata for this particular image
Metadata metadata = Metadata.newInstance(parseContext);
String suffix = getSuffix(pdImage, metadata);
- String fileName = "image" + imageNumber + "." + suffix;
+ String fileName =
EmbeddedDocumentUtil.EmbeddedResourcePrefix.IMAGE.getPrefix()
+ + "-" + imageNumber + "." + suffix;
AttributesImpl attr = new AttributesImpl();
@@ -398,6 +399,7 @@ public class ImageGraphicsEngine extends
PDFGraphicsStreamEngine {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED,
true);
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
metadata.set(TikaPagedText.PAGE_NUMBER, pageNumber);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 01a7f85aae..653b0e8d62 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -846,7 +846,7 @@ public class PDFParserTest extends TikaTest {
//regular attachment
assertContains("<div source=\"attachment\" class=\"embedded\"
id=\"Unit10.doc\" />", r.xml);
//inline image
- assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\"
/>", r.xml);
+ assertContains("<img src=\"embedded:image-1.tif\" alt=\"image-1.tif\"
/>", r.xml);
//doc embedded inside an annotation
r = getXML("testPDFFileEmbInAnnotation.pdf");
@@ -1336,11 +1336,11 @@ public class PDFParserTest extends TikaTest {
assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class));
assertEquals(2, metadataList.size());
assertEquals("image/png",
metadataList.get(1).get(Metadata.CONTENT_TYPE));
- assertEquals("/image0.png",
+ assertEquals("/image-0.png",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals(261, (int)
metadataList.get(1).getInt(Metadata.IMAGE_LENGTH));
assertEquals(934, (int)
metadataList.get(1).getInt(Metadata.IMAGE_WIDTH));
- assertEquals("image0.png",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("image-0.png",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
}
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 13f411d068..cae0d63310 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -17,7 +17,7 @@
package org.apache.tika.parser.microsoft;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.junit.jupiter.api.Test;
@@ -77,10 +77,10 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals("1", handler.filenames.get(4));
assertEquals(null, handler.filenames.get(5));
assertEquals("2", handler.filenames.get(6));
- assertEquals("image1.png", handler.filenames.get(7));
- assertEquals("image2.jpg", handler.filenames.get(8));
- assertEquals("image3.png", handler.filenames.get(9));
- assertEquals("image1.png", handler.filenames.get(16));
+ assertEquals("image-1.png", handler.filenames.get(7));
+ assertEquals("image-2.jpg", handler.filenames.get(8));
+ assertEquals("image-3.png", handler.filenames.get(9));
+ assertEquals("image-1.png", handler.filenames.get(16));
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc
assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded
office doc
@@ -102,12 +102,12 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
// Filenames are a bit iffy...
// Should really be 3*embedded pictures then 3*icons then embedded docs
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
+ assertEquals("image-1.emf", handler.filenames.get(0));
+ assertEquals("image-4.png", handler.filenames.get(1));
+ assertEquals("image-5.jpg", handler.filenames.get(2));
+ assertEquals("image-6.png", handler.filenames.get(3));
+ assertEquals("image-2.emf", handler.filenames.get(4));
+ assertEquals("image-3.emf", handler.filenames.get(5));
assertEquals(null, handler.filenames.get(6));
assertEquals("_1345471035.ppt", handler.filenames.get(7));
assertEquals("_1345470949.xls", handler.filenames.get(8));
@@ -130,18 +130,19 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals(16, handler.mediaTypes.size());
// We don't know their filenames, except for doc images + docx
- assertEquals("image1.emf", handler.filenames.get(0));
- assertEquals("image4.png", handler.filenames.get(1));
- assertEquals("image5.jpg", handler.filenames.get(2));
- assertEquals("image6.png", handler.filenames.get(3));
- assertEquals("image2.emf", handler.filenames.get(4));
- assertEquals("image3.emf", handler.filenames.get(5));
+ assertEquals("image-1.emf", handler.filenames.get(0));
+ assertEquals("image-4.png", handler.filenames.get(1));
+ assertEquals("image-5.jpg", handler.filenames.get(2));
+ assertEquals("image-6.png", handler.filenames.get(3));
+ assertEquals("image-2.emf", handler.filenames.get(4));
+ assertEquals("image-3.emf", handler.filenames.get(5));
assertEquals(null, handler.filenames.get(6));
assertEquals("image2.png", handler.filenames.get(7));
assertEquals("image3.jpeg", handler.filenames.get(8));
assertEquals("image4.png", handler.filenames.get(9));
+ // PPT slide images now get generated names
for (int i = 11; i < 14; i++) {
- assertNull(handler.filenames.get(i));
+ assertNotNull(handler.filenames.get(i));
}
// But we do know their types
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
office doc
@@ -164,14 +165,12 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals(7, handler.filenames.size());
assertEquals(7, handler.mediaTypes.size());
- // We don't get all that helpful filenames
+ // Embedded objects get OLE IDs, slide images now get generated names
assertEquals("1", handler.filenames.get(0));
assertEquals("2", handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
- assertEquals(null, handler.filenames.get(4));
- assertEquals(null, handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
+ for (int i = 2; i < 7; i++) {
+ assertNotNull(handler.filenames.get(i));
+ }
// But we do know their types
assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office
doc
assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office
doc
@@ -189,14 +188,13 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals("1", handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
assertEquals("2", handler.filenames.get(2));
- assertEquals("image1.png", handler.filenames.get(3));
- assertEquals("image2.jpg", handler.filenames.get(4));
- assertEquals("image3.png", handler.filenames.get(5));
- assertEquals(null, handler.filenames.get(6));
- assertEquals(null, handler.filenames.get(7));
- assertEquals(null, handler.filenames.get(8));
- assertEquals(null, handler.filenames.get(9));
- assertEquals(null, handler.filenames.get(10));
+ assertEquals("image-1.png", handler.filenames.get(3));
+ assertEquals("image-2.jpg", handler.filenames.get(4));
+ assertEquals("image-3.png", handler.filenames.get(5));
+ // PPT slide images now get generated names
+ for (int i = 6; i < 11; i++) {
+ assertNotNull(handler.filenames.get(i));
+ }
assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office
doc
assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // PNG inside
.xls
@@ -216,7 +214,7 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals(2, handler.filenames.size());
assertEquals(2, handler.mediaTypes.size());
- assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals("image-1.emf", handler.filenames.get(0));
assertEquals("_1402837031.pdf", handler.filenames.get(1));
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded
pdf
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 7bd45cf813..e3be158582 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -47,8 +47,8 @@ public class RTFParserTest extends TikaTest {
Map<Integer, Pair> expected = new HashMap<>();
expected.put(3, new Pair("Hw.txt", "text/plain;
charset=windows-1252"));
- expected.put(4, new Pair("file_0.doc", "application/msword"));
- expected.put(7, new Pair("file_1.xlsx",
+ expected.put(4, new Pair("embedded-0.doc", "application/msword"));
+ expected.put(7, new Pair("embedded-1.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
expected.put(10, new Pair("text.html", "text/html;
charset=windows-1252"));
expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
@@ -57,17 +57,17 @@ public class RTFParserTest extends TikaTest {
expected.put(15, new
Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html",
"text/html; charset=UTF-8"));
expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
- expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(21, new Pair("embedded-2.xls",
"application/vnd.ms-excel"));
expected.put(24,
new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg",
"application/vnd.ms-outlook"));
- expected.put(27, new Pair("file_3.pdf", "application/pdf"));
- expected.put(30, new Pair("file_4.ppt",
"application/vnd.ms-powerpoint"));
- expected.put(34, new Pair("file_5.pptx",
+ expected.put(27, new Pair("embedded-3.pdf", "application/pdf"));
+ expected.put(30, new Pair("embedded-4.ppt",
"application/vnd.ms-powerpoint"));
+ expected.put(34, new Pair("embedded-5.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
- expected.put(37, new Pair("file_6.doc", "application/msword"));
- expected.put(40, new Pair("file_7.doc", "application/msword"));
- expected.put(43, new Pair("file_8.docx",
+ expected.put(37, new Pair("embedded-6.doc", "application/msword"));
+ expected.put(40, new Pair("embedded-7.doc", "application/msword"));
+ expected.put(43, new Pair("embedded-8.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg",
"image/jpeg"));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 76910c56ff..fa160184b8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -244,7 +244,7 @@ public class PDFParserTest extends TikaTest {
List<Metadata> metadatas = handler.getMetadataList();
assertEquals(5, metadatas.size());
assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("image0.jpg",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("image-0.jpg",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("Press Quality(1).joboptions",
metadatas.get(3).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("Unit10.doc",
metadatas.get(4).get(TikaCoreProperties.RESOURCE_NAME_KEY));
@@ -367,7 +367,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("352",
metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "width"));
assertEquals("testPDF_JBIG2.pdf",
metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("image0.jb2",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("image-0.jb2",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(MediaType.image("x-jbig2").toString(),
metadatas.get(1).get(Metadata.CONTENT_TYPE));
}
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/StandardUnpackSelector.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/StandardUnpackSelector.java
index 55d92046d5..0bd3650aed 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/StandardUnpackSelector.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/StandardUnpackSelector.java
@@ -20,6 +20,7 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.tika.config.TikaComponent;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.extractor.UnpackSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -77,7 +78,7 @@ public class StandardUnpackSelector implements UnpackSelector
{
}
// Also compute normalized mime for OCR types (image/ocr-jpeg ->
image/jpeg)
- String normalizedMime = normalizeOcrType(mime);
+ String normalizedMime = EmbeddedDocumentUtil.normalizeMediaType(mime);
if (excludeMimeTypes.contains(mime) ||
excludeMimeTypes.contains(normalizedMime)) {
return false;
@@ -136,16 +137,6 @@ public class StandardUnpackSelector implements
UnpackSelector {
this.excludeEmbeddedResourceTypes = new
HashSet<>(excludeEmbeddedResourceTypes);
}
- /**
- * Normalize OCR media types (e.g., image/ocr-jpeg -> image/jpeg).
- * These are internal routing types used by AbstractImageParser for
tesseract delegation.
- */
- private static String normalizeOcrType(String mime) {
- if (mime != null && mime.startsWith("image/ocr-")) {
- return "image/" + mime.substring("image/ocr-".length());
- }
- return mime;
- }
@Override
public String toString() {
diff --git
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index c97c1311df..cd02d99767 100644
---
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -36,6 +36,7 @@ import org.apache.tika.digest.SkipContainerDocumentDigest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.extractor.UnpackHandler;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -149,7 +150,8 @@ class ParseHandler {
parseContext.set(ParsingIntent.class, ParsingIntent.WILL_PARSE);
try {
MediaType mt = detector.detect(tis, metadata, parseContext);
- metadata.set(Metadata.CONTENT_TYPE, mt.toString());
+ metadata.set(Metadata.CONTENT_TYPE,
+ EmbeddedDocumentUtil.normalizeMediaType(mt.toString()));
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
mt.toString());
} catch (IOException e) {
LOG.warn("problem detecting: " + t.getId(), e);