This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new c95697127f TIKA-4614: add Media Management metadata extraction, avoid
NPE, add test
c95697127f is described below
commit c95697127f1ea4d0c82ceea51f7af368d983ccb2
Author: Tilman Hausherr <[email protected]>
AuthorDate: Thu Jan 8 11:11:29 2026 +0100
TIKA-4614: add Media Management metadata extraction, avoid NPE, add test
---
.../tika/parser/xmp/XMPMetadataExtractor.java | 118 ++++++++++++---
.../tika/parser/xmp/XmpboxExtractorTest.java | 167 +++++++++++++++++++++
2 files changed, 266 insertions(+), 19 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
index a64fb10882..4f1f656eca 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
@@ -18,14 +18,19 @@ package org.apache.tika.parser.xmp;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Date;
+import java.util.Calendar;
import java.util.List;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
+import org.apache.xmpbox.schema.XMPMediaManagementSchema;
+import org.apache.xmpbox.type.AbstractField;
+import org.apache.xmpbox.type.ArrayProperty;
import org.apache.xmpbox.type.BadFieldValueException;
+import org.apache.xmpbox.type.ResourceEventType;
+import org.apache.xmpbox.type.ResourceRefType;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.tika.exception.TikaException;
@@ -33,12 +38,16 @@ import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.utils.DateUtils;
/**
* XMP Metadata Extractor based on Apache XmpBox.
*/
public class XMPMetadataExtractor {
+ private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+
/**
* Parse the XMP Packets.
*
@@ -59,6 +68,7 @@ public class XMPMetadataExtractor {
}
extractDublinCoreSchema(xmp, metadata);
extractXMPBasicSchema(xmp, metadata);
+ extractXMPMM(xmp, metadata);
}
/**
@@ -67,18 +77,13 @@ public class XMPMetadataExtractor {
* Silently swallows exceptions.
* @param xmp the XMP Metadata object.
* @param metadata the metadata map
+ * @throws IOException
*/
public static void extractDublinCoreSchema(XMPMetadata xmp, Metadata
metadata) throws IOException {
if (xmp == null) {
return;
}
- DublinCoreSchema schemaDublinCore;
- try {
- schemaDublinCore = xmp.getDublinCoreSchema();
- } catch (Throwable e) {
- // Swallow
- return;
- }
+ DublinCoreSchema schemaDublinCore = xmp.getDublinCoreSchema();
if (schemaDublinCore != null) {
try {
addMetadata(metadata, DublinCore.TITLE,
schemaDublinCore.getTitle());
@@ -102,27 +107,102 @@ public class XMPMetadataExtractor {
* Silently swallows exceptions.
* @param xmp the XMP Metadata object.
* @param metadata the metadata map
+ * @throws IOException
*/
public static void extractXMPBasicSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
if (xmp == null) {
return;
}
- XMPBasicSchema schemaBasic;
- try {
- schemaBasic = xmp.getXMPBasicSchema();
- } catch (Throwable e) {
- // Swallow
- return;
- }
+ XMPBasicSchema schemaBasic = xmp.getXMPBasicSchema();
if (schemaBasic != null) {
addMetadata(metadata, XMP.CREATOR_TOOL, schemaBasic.getCreatorTool());
- addMetadata(metadata, XMP.CREATE_DATE, schemaBasic.getCreateDate().getTime());
- addMetadata(metadata, XMP.MODIFY_DATE, schemaBasic.getModifyDate().getTime());
- addMetadata(metadata, XMP.METADATA_DATE, schemaBasic.getModifyDate().getTime());
+ // Hand over the Calendar itself: the Calendar overload of addMetadata
+ // null-checks its value, so an absent date no longer triggers an NPE.
+ addMetadata(metadata, XMP.CREATE_DATE, schemaBasic.getCreateDate());
+ addMetadata(metadata, XMP.MODIFY_DATE, schemaBasic.getModifyDate());
+ // xmp:MetadataDate is a separate property from xmp:ModifyDate and has
+ // its own accessor; do not reuse the modify date here.
+ addMetadata(metadata, XMP.METADATA_DATE, schemaBasic.getMetadataDate());
addMetadata(metadata, XMP.RATING, schemaBasic.getRating());
}
}
+    /**
+     * Returns the current cap on XMP Media Management (XMPMM) history events.
+     *
+     * @return the maximum number of history events that
+     *         {@link #extractXMPMM(XMPMetadata, Metadata)} adds to the metadata
+     */
+    public static int getMaxXMPMMHistory() {
+        return MAX_EVENT_HISTORY_IN_XMPMM;
+    }
+
+    /**
+     * Sets the cap on the number of events taken from the event history
+     * in the XMP Media Management (XMPMM) section.
+     * <p>
+     * Once the cap is reached the extractor silently stops adding events.
+     * The setting is static and therefore applies to every extraction.
+     * <p>
+     * The default is 1024.
+     *
+     * @param maxEvents the new maximum number of history events; with a value
+     *                  of zero or less no history events are added
+     */
+    public static void setMaxXMPMMHistory(int maxEvents) {
+        MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
+    }
+
+    /**
+     * Extracts Media Management (XMPMM) metadata from XMP.
+     * <p>
+     * Copies the document ID, instance ID and original document ID, the
+     * document and instance IDs of the "derived from" reference, and the event
+     * history. Each history event is written to four properties (instance ID,
+     * action, when, software agent) using {@code metadata.add}; events whose
+     * instance ID is null or blank are skipped, and at most
+     * {@link #getMaxXMPMMHistory()} events are added.
+     * <p>
+     * This method contains no exception handling of its own, so anything
+     * thrown by the calls it makes propagates to the caller.
+     *
+     * @param xmp the XMP Metadata object; if {@code null}, nothing is extracted
+     * @param metadata the metadata map to populate
+     */
+    public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
+        if (xmp == null) {
+            return;
+        }
+        XMPMediaManagementSchema mmSchema = xmp.getXMPMediaManagementSchema();
+        if (mmSchema == null) {
+            return;
+        }
+
+        // NOTE(review): DOCUMENTID goes through addMetadata while the next two
+        // properties use metadata.set directly - confirm the difference is intended.
+        addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
+        metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
+        metadata.set(XMPMM.ORIGINAL_DOCUMENTID, mmSchema.getOriginalDocumentID());
+
+        //TODO after XMPBox 3.0.7: use mmSchema.getDerivedFromProperty() instead
+        ResourceRefType derivedFrom = mmSchema.getResourceRefProperty();
+        if (derivedFrom != null) {
+            addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
+            addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
+        }
+
+        ArrayProperty historyProperty = mmSchema.getHistoryProperty();
+        if (historyProperty == null) {
+            return;
+        }
+
+        // Read the volatile limit once so that a concurrent call to
+        // setMaxXMPMMHistory() cannot change the cap in the middle of the loop.
+        final int maxEvents = MAX_EVENT_HISTORY_IN_XMPMM;
+        int eventsAdded = 0;
+        for (AbstractField af : historyProperty.getAllProperties()) {
+            if (eventsAdded >= maxEvents) {
+                break;
+            }
+            if (!(af instanceof ResourceEventType)) {
+                continue;
+            }
+            ResourceEventType stevt = (ResourceEventType) af;
+            String instanceId = stevt.getInstanceID();
+            if (instanceId == null || instanceId.isBlank()) {
+                // an event without an instance ID is not recorded at all
+                continue;
+            }
+            String action = stevt.getAction();
+            Calendar when = stevt.getWhen();
+            String softwareAgent = stevt.getSoftwareAgent();
+
+            // for absent data elements, pass in empty strings so that the
+            // parallel arrays will have matching offsets for absent data
+            metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
+            metadata.add(XMPMM.HISTORY_ACTION, action == null ? "" : action);
+            metadata.add(XMPMM.HISTORY_WHEN, when == null ? "" : DateUtils.formatDate(when));
+            metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent == null ? "" : softwareAgent);
+            eventsAdded++;
+        }
+    }
+
/**
* Add list to the metadata map.
*
@@ -179,7 +259,7 @@ public class XMPMetadataExtractor {
* @param property the property to add.
* @param value the value to add.
*/
- private static void addMetadata(Metadata metadata, Property property, Date
value) {
+ private static void addMetadata(Metadata metadata, Property property,
Calendar value) {
if (value != null) {
metadata.set(property, value);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java
new file mode 100644
index 0000000000..b56e9b3be5
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+
+/**
+ * Tests for {@link XMPMetadataExtractor}, fed with the XMP packets that
+ * {@link XMPPacketScanner} finds in the test documents.
+ *
+ * @author Tilman Hausherr
+ */
+@Disabled //TODO enable with XMPBox 3.0.7
+public class XmpboxExtractorTest extends TikaTest {
+
+    private final XMPPacketScanner scanner = new XMPPacketScanner();
+
+    /**
+     * Scans a test resource for an XMP packet and asserts that one was found.
+     *
+     * @param resourceName classpath name of the test document
+     * @return the raw bytes of the scanned packet
+     */
+    private UnsynchronizedByteArrayOutputStream scanPacket(String resourceName)
+            throws IOException, TikaException {
+        try (TikaInputStream tis = getResourceAsStream(resourceName)) {
+            UnsynchronizedByteArrayOutputStream packet =
+                    UnsynchronizedByteArrayOutputStream.builder().get();
+            assertTrue(scanner.parse(tis, packet));
+            return packet;
+        }
+    }
+
+    /**
+     * Runs the extractor over a previously scanned packet.
+     *
+     * @param packet the raw XMP packet
+     * @param target the metadata object that receives the extracted values
+     */
+    private static void parsePacket(UnsynchronizedByteArrayOutputStream packet, Metadata target)
+            throws IOException, TikaException {
+        try (InputStream is = packet.toInputStream()) {
+            XMPMetadataExtractor.parse(is, target);
+        }
+    }
+
+    /**
+     * Asserts that every expected keyword is among the SUBJECT values.
+     */
+    private static void assertSubjects(Metadata metadata, String... expected) {
+        Collection<String> keywords =
+                Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
+        for (String keyword : expected) {
+            assertTrue(keywords.contains(keyword));
+        }
+    }
+
+    // parsing fails because of bad date "2010-07-28T11:02:12.000CEST" = UTC+02:00
+    @Test
+    public void testParseJpeg() throws IOException, TikaException {
+        UnsynchronizedByteArrayOutputStream packet =
+                scanPacket("/test-documents/testJPEG_commented.jpg");
+        Metadata metadata = new Metadata();
+
+        // set some values before extraction to see that they are overridden
+
+        //TODO this doesn't work here because this extractor uses addMetadata() which works
+        // differently than metadata.set(). We may want to fix one or the other.
+//        metadata.set(TikaCoreProperties.TITLE, "old title");
+//        metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
+//        metadata.set(TikaCoreProperties.CREATOR, "previous author");
+        // ... or kept in case the field is multi-value
+        metadata.add(TikaCoreProperties.SUBJECT, "oldkeyword");
+
+        // xmpbox fails parsing on bad dates, so rewrite the zone suffix first
+        String xml = packet.toString(StandardCharsets.UTF_8).replace("CEST\"", "+02:00\"");
+        XMPMetadataExtractor.parse(
+                new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)), metadata);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        assertSubjects(metadata,
+                "oldkeyword", "grazelands", "nature reserve", "bird watching", "coast");
+    }
+
+    @Test
+    public void testParseJpegPhotoshop() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        parsePacket(scanPacket("/test-documents/testJPEG_commented_pspcs2mac.jpg"), metadata);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        assertSubjects(metadata, "bird watching", "coast");
+    }
+
+    @Test
+    public void testParseJpegXnviewmp() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        parsePacket(scanPacket("/test-documents/testJPEG_commented_xnviewmp026.jpg"), metadata);
+
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertSubjects(metadata, "coast", "nature reserve");
+    }
+
+    @Test
+    public void testMaxXMPMMHistory() throws Exception {
+        final int originalLimit = XMPMetadataExtractor.getMaxXMPMMHistory();
+        try {
+            UnsynchronizedByteArrayOutputStream packet = scanPacket("/test-documents/testXMP.xmp");
+
+            // with the limit left as it was, all seven history events are extracted
+            Metadata withOriginalLimit = new Metadata();
+            parsePacket(packet, withOriginalLimit);
+            assertEquals(7, withOriginalLimit.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+
+            // with a lower limit, extraction stops after five events
+            XMPMetadataExtractor.setMaxXMPMMHistory(5);
+            Metadata withLowerLimit = new Metadata();
+            parsePacket(packet, withLowerLimit);
+            assertEquals(5, withLowerLimit.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+        } finally {
+            //if something goes wrong, make sure to set this back to what it was
+            XMPMetadataExtractor.setMaxXMPMMHistory(originalLimit);
+        }
+    }
+
+}