This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2f7b46d7db TIKA-4614: add Media Management metadata extraction, avoid NPE, add test
2f7b46d7db is described below

commit 2f7b46d7db85763a4f18e7e4ba8dbcf2aa09012e
Author: Tilman Hausherr <[email protected]>
AuthorDate: Thu Jan 8 11:11:29 2026 +0100

    TIKA-4614: add Media Management metadata extraction, avoid NPE, add test
---
 .../tika/parser/xmp/XMPMetadataExtractor.java      | 118 ++++++++++++---
 .../tika/parser/xmp/XmpboxExtractorTest.java       | 167 +++++++++++++++++++++
 2 files changed, 266 insertions(+), 19 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
index a26d408e9a..60c9797899 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPMetadataExtractor.java
@@ -18,14 +18,19 @@ package org.apache.tika.parser.xmp;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Date;
+import java.util.Calendar;
 import java.util.List;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.xmpbox.XMPMetadata;
 import org.apache.xmpbox.schema.DublinCoreSchema;
 import org.apache.xmpbox.schema.XMPBasicSchema;
+import org.apache.xmpbox.schema.XMPMediaManagementSchema;
+import org.apache.xmpbox.type.AbstractField;
+import org.apache.xmpbox.type.ArrayProperty;
 import org.apache.xmpbox.type.BadFieldValueException;
+import org.apache.xmpbox.type.ResourceEventType;
+import org.apache.xmpbox.type.ResourceRefType;
 import org.apache.xmpbox.xml.DomXmpParser;
 
 import org.apache.tika.exception.TikaException;
@@ -33,12 +38,16 @@ import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.utils.DateUtils;
 
 /**
  * XMP Metadata Extractor based on Apache XmpBox.
  */
 public class XMPMetadataExtractor {
 
+    private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+
     /**
      * Parse the XMP Packets.
      *
@@ -59,6 +68,7 @@ public class XMPMetadataExtractor {
         }
         extractDublinCoreSchema(xmp, metadata);
         extractXMPBasicSchema(xmp, metadata);
+        extractXMPMM(xmp, metadata);
     }
 
     /**
@@ -67,18 +77,13 @@ public class XMPMetadataExtractor {
      * Silently swallows exceptions.
      * @param xmp the XMP Metadata object.
      * @param metadata the metadata map
+     * @throws IOException
      */
     public static void extractDublinCoreSchema(XMPMetadata xmp, Metadata 
metadata) throws IOException {
         if (xmp == null) {
             return;
         }
-        DublinCoreSchema schemaDublinCore;
-        try {
-            schemaDublinCore = xmp.getDublinCoreSchema();
-        } catch (Throwable e) {
-            // Swallow
-            return;
-        }
+        DublinCoreSchema schemaDublinCore = xmp.getDublinCoreSchema();
         if (schemaDublinCore != null) {
             try {
                 addMetadata(metadata, DublinCore.TITLE, 
schemaDublinCore.getTitle());
@@ -99,27 +104,102 @@ public class XMPMetadataExtractor {
      * Silently swallows exceptions.
      * @param xmp the XMP Metadata object.
      * @param metadata the metadata map
+     * @throws IOException
      */
     public static void extractXMPBasicSchema(XMPMetadata xmp, Metadata 
metadata) throws IOException {
         if (xmp == null) {
             return;
         }
-        XMPBasicSchema schemaBasic;
-        try {
-            schemaBasic = xmp.getXMPBasicSchema();
-        } catch (Throwable e) {
-            // Swallow
-            return;
-        }
+        XMPBasicSchema schemaBasic = xmp.getXMPBasicSchema();
         if (schemaBasic != null) {
             addMetadata(metadata, XMP.CREATOR_TOOL, 
schemaBasic.getCreatorTool());
-            addMetadata(metadata, XMP.CREATE_DATE, 
schemaBasic.getCreateDate().getTime());
-            addMetadata(metadata, XMP.MODIFY_DATE, 
schemaBasic.getModifyDate().getTime());
-            addMetadata(metadata, XMP.METADATA_DATE, 
schemaBasic.getModifyDate().getTime());
+            addMetadata(metadata, XMP.CREATE_DATE, 
schemaBasic.getCreateDate());
+            addMetadata(metadata, XMP.MODIFY_DATE, 
schemaBasic.getModifyDate());
+            addMetadata(metadata, XMP.METADATA_DATE, 
schemaBasic.getModifyDate());
             addMetadata(metadata, XMP.RATING, schemaBasic.getRating());
         }
     }
 
+    /**
+     * @return maximum number of events to extract from the XMPMM history.
+     */
+    public static int getMaxXMPMMHistory() {
+        return MAX_EVENT_HISTORY_IN_XMPMM;
+    }
+
+    /**
+     * Maximum number of events to extract from the
+     * event history in the XMP Media Management (XMPMM) section.
+     * The extractor will silently stop adding events after it
+     * has reached this threshold.
+     * <p>
+     * The default is 1024.
+     * @param maxEvents
+     */
+    public static void setMaxXMPMMHistory(int maxEvents) {
+        MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
+    }
+
+    /**
+     * Extracts Media Management metadata from XMP.
+     * <p>
+     * Silently swallows exceptions.
+     *
+     * @param xmp
+     * @param metadata
+     */
+    public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
+        if (xmp == null) {
+            return;
+        }
+        XMPMediaManagementSchema mmSchema = xmp.getXMPMediaManagementSchema();
+        if (mmSchema != null) {
+            addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
+            metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
+            metadata.set(XMPMM.ORIGINAL_DOCUMENTID, 
mmSchema.getOriginalDocumentID());
+
+            //ResourceRefType derivedFrom = mmSchema.getDerivedFromProperty(); 
//TODO after XMPBox 3.0.7
+            ResourceRefType derivedFrom = mmSchema.getResourceRefProperty();
+            
+            if (derivedFrom != null) {
+                addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, 
derivedFrom.getDocumentID());
+                addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, 
derivedFrom.getInstanceID());
+            }
+            ArrayProperty historyProperty = mmSchema.getHistoryProperty();
+            if (historyProperty != null) {
+                int eventsAdded = 0;
+                for (AbstractField af : historyProperty.getAllProperties()) {
+                    if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
+                        break;
+                    }
+                    if (!(af instanceof ResourceEventType))
+                    {
+                        continue;
+                    }
+                    ResourceEventType stevt = (ResourceEventType) af;
+                    String instanceId = stevt.getInstanceID();
+                    String action = stevt.getAction();
+                    Calendar when = stevt.getWhen();
+                    String softwareAgent = stevt.getSoftwareAgent();
+                    if (instanceId != null && !instanceId.isBlank())
+                    {
+                        // for absent data elements, pass in empty strings so
+                        // that parallel arrays will have matching offsets for 
absent data
+                        action = action == null ? "" : action;
+                        String dateString = when == null ? "" : 
DateUtils.formatDate(when);
+                        softwareAgent = softwareAgent == null ? "" : 
softwareAgent;
+
+                        metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, 
instanceId);
+                        metadata.add(XMPMM.HISTORY_ACTION, action);
+                        metadata.add(XMPMM.HISTORY_WHEN, dateString);
+                        metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, 
softwareAgent);
+                        eventsAdded++;
+                    }
+                }
+            }
+        }
+    }
+
     /**
      * Add list to the metadata map.
      *
@@ -176,7 +256,7 @@ public class XMPMetadataExtractor {
      * @param property the property to add.
      * @param value the value to add.
      */
-    private static void addMetadata(Metadata metadata, Property property, Date 
value) {
+    private static void addMetadata(Metadata metadata, Property property, 
Calendar value) {
         if (value != null) {
             metadata.set(property, value);
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java
new file mode 100644
index 0000000000..b56e9b3be5
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/XmpboxExtractorTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+
+/**
+ *
+ * @author Tilman Hausherr
+ */
+@Disabled //TODO enable with XMPBox 3.0.7
+public class XmpboxExtractorTest extends TikaTest {
+
+    private final XMPPacketScanner scanner = new XMPPacketScanner();
+
+    @Test // parsing fails because of bad date "2010-07-28T11:02:12.000CEST" = 
UTC+02:00
+    public void testParseJpeg() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = 
getResourceAsStream("/test-documents/testJPEG_commented.jpg")) {
+            UnsynchronizedByteArrayOutputStream xmpraw = 
UnsynchronizedByteArrayOutputStream.builder().get();
+            boolean parsed = scanner.parse(tis, xmpraw);
+            assertTrue(parsed);
+
+            // set some values before extraction to see that they are 
overridden
+
+            //TODO this doesn't work here because this extractor uses 
addMetadata() which works
+            // differently than metadata.set(). We may want to fix one or the 
other.
+//            metadata.set(TikaCoreProperties.TITLE, "old title");
+//            metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
+//            metadata.set(TikaCoreProperties.CREATOR, "previous author");
+            // ... or kept in case the field is multi-value
+            metadata.add(TikaCoreProperties.SUBJECT, "oldkeyword");
+
+            // xmpbox fails parsing on bad dates
+            String s = xmpraw.toString(StandardCharsets.UTF_8);
+            s = s.replace("CEST\"", "+02:00\"");
+
+            XMPMetadataExtractor.parse(new 
ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)), metadata);
+
+            // DublinCore fields
+            assertEquals("Tosteberga \u00C4ngar", 
metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Some Tourist", 
metadata.get(TikaCoreProperties.CREATOR));
+            Collection<String> keywords =
+                    
Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
+            assertTrue(keywords.contains("oldkeyword"));
+            assertTrue(keywords.contains("grazelands"));
+            assertTrue(keywords.contains("nature reserve"));
+            assertTrue(keywords.contains("bird watching"));
+            assertTrue(keywords.contains("coast"));
+        }
+    }
+
+    @Test
+    public void testParseJpegPhotoshop() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = getResourceAsStream(
+                "/test-documents/testJPEG_commented_pspcs2mac.jpg")) {
+            UnsynchronizedByteArrayOutputStream xmpraw = 
UnsynchronizedByteArrayOutputStream.builder().get();
+            boolean parsed = scanner.parse(tis, xmpraw);
+            assertTrue(parsed);
+
+            try (InputStream is = xmpraw.toInputStream()) {
+                XMPMetadataExtractor.parse(is, metadata);
+            }
+
+            // DublinCore fields
+            assertEquals("Tosteberga \u00C4ngar", 
metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals("Some Tourist", 
metadata.get(TikaCoreProperties.CREATOR));
+            Collection<String> keywords =
+                    
Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
+            assertTrue(keywords.contains("bird watching"));
+            assertTrue(keywords.contains("coast"));
+        }
+    }
+
+    @Test
+    public void testParseJpegXnviewmp() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = getResourceAsStream(
+                "/test-documents/testJPEG_commented_xnviewmp026.jpg")) {
+            UnsynchronizedByteArrayOutputStream xmpraw = 
UnsynchronizedByteArrayOutputStream.builder().get();
+            boolean parsed = scanner.parse(tis, xmpraw);
+            assertTrue(parsed);
+
+            try (InputStream is = xmpraw.toInputStream()) {
+                XMPMetadataExtractor.parse(is, metadata);
+            }
+
+            assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            Collection<String> keywords =
+                    
Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT));
+            assertTrue(keywords.contains("coast"));
+            assertTrue(keywords.contains("nature reserve"));
+        }
+    }
+
+    @Test
+    public void testMaxXMPMMHistory() throws Exception {
+        Metadata metadata = new Metadata();
+        int maxHistory = XMPMetadataExtractor.getMaxXMPMMHistory();
+        try {
+            try (TikaInputStream tis = 
getResourceAsStream("/test-documents/testXMP.xmp")) {
+                UnsynchronizedByteArrayOutputStream xmpraw = 
UnsynchronizedByteArrayOutputStream.builder().get();
+                boolean parsed = scanner.parse(tis, xmpraw);
+                assertTrue(parsed);
+
+                try (InputStream is = xmpraw.toInputStream()) {
+                    XMPMetadataExtractor.parse(is, metadata);
+                }
+
+                assertEquals(7, 
metadata.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+                
+                XMPMetadataExtractor.setMaxXMPMMHistory(5);
+                metadata = new Metadata();
+                try (InputStream is = xmpraw.toInputStream()) {
+                    XMPMetadataExtractor.parse(is, metadata);
+                }
+
+                assertEquals(5, 
metadata.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+            }
+        }
+        finally {
+            //if something goes wrong, make sure to set this back to what it 
was
+            XMPMetadataExtractor.setMaxXMPMMHistory(maxHistory);
+        }
+    }
+
+}

Reply via email to