This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 5a64d7c3b TIKA-4449 (#2266)
5a64d7c3b is described below

commit 5a64d7c3b803f076cb2ae0e9f71334b8b0ae1083
Author: Tim Allison <talli...@apache.org>
AuthorDate: Tue Jul 8 07:51:58 2025 -0400

    TIKA-4449 (#2266)
    
    * TIKA-4444 -- improve precision of metadata keys for xmp-derived metadata
    
    (cherry picked from commit 77a39c4f25e0e0f23cf6662bbad3a24b8dc94f65)
---
 .../java/org/apache/tika/metadata/DublinCore.java  |  10 ++
 .../main/java/org/apache/tika/metadata/XMP.java    |   8 +
 .../tika/metadata/{DublinCore.java => XMPDC.java}  |  17 +-
 .../main/java/org/apache/tika/metadata/XMPPDF.java |  42 +++++
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  15 +-
 .../tika/parser/pdf/PDMetadataExtractor.java       | 183 ++++++++++-----------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  56 +++++++
 .../resources/test-documents/testPDF-TIKA-4444.pdf | Bin 0 -> 103951 bytes
 8 files changed, 220 insertions(+), 111 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java 
b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index 23750c35d..a4e32cb8b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -20,6 +20,16 @@ package org.apache.tika.metadata;
  * A collection of Dublin Core metadata names.
  *
  * @see <a href="http://dublincore.org">dublincore.org</a>
+ *
+ * <p>
+ * These keys are intended to be a unifying normalization of information
+ * within a file. For some formats, like PDF, where there may be conflicting
+ * information in different parts of the file (xmp vs. docinfo) for the
+ * same metadata key, we do what we can, and these keys represent a
+ * normalization of metadata values within a file.
+ * <p>
+ * For Dublin Core information that derives specifically and only from
+ * XMP, see {@link XMPDC}.
  */
 public interface DublinCore {
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java 
b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
index bca38d40b..12842c5f8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.metadata;
 
+/**
+ * Metadata keys for the XMP Basic Schema
+ */
 public interface XMP {
 
     String NAMESPACE_URI = "http://ns.adobe.com/xap/1.0/";
@@ -86,4 +89,9 @@ public interface XMP {
      */
     Property RATING = Property.externalInteger(PREFIX_ + "Rating");
 
+    /**
+     * This doesn't belong to the XMP Basic schema. However, because it is part of
+     * JempBox's XMPBasicSchema, we include this here.
+     */
+    Property TITLE = Property.externalText(PREFIX_ + "Title");
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java 
b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
similarity index 92%
copy from tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
copy to tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
index 23750c35d..26f60407f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
@@ -17,16 +17,17 @@
 package org.apache.tika.metadata;
 
 /**
- * A collection of Dublin Core metadata names.
- *
- * @see <a href="http://dublincore.org">dublincore.org</a>
+ * Metadata keys for the XMP DublinCore schema. This differs from {@link DublinCore} in
+ * that this data must derive strictly from XMP. Tika applies logic to normalize
+ * metadata keys and values into {@link DublinCore}. This process can make it difficult to determine
+ * if the underlying metadata derived from a literal XMP component or from another source within the file.
+ * <p>
+ * See <a href="https://issues.apache.org/jira/browse/TIKA-4444">TIKA-4444</a>.
  */
-public interface DublinCore {
+public interface XMPDC {
 
-    String NAMESPACE_URI_DC = "http://purl.org/dc/elements/1.1/";
-    String NAMESPACE_URI_DC_TERMS = "http://purl.org/dc/terms/";
-    String PREFIX_DC = "dc";
-    String PREFIX_DC_TERMS = "dcterms";
+    String PREFIX_DC = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dc";
+    String PREFIX_DC_TERMS = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dcterms";
 
     /**
      * Typically, Format may include the media-type or dimensions of the
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java 
b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java
new file mode 100644
index 000000000..a4d1bb13a
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * Metadata keys for the XMP PDF Schema
+ */
+public interface XMPPDF {
+
+
+    String PREFIX = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "pdf"
+            + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    /**
+     * Unordered text strings of about.
+     */
+    Property ABOUT = Property.externalTextBag(PREFIX + "About");
+
+    /**
+     * Unordered text strings of keywords.
+     */
+    Property KEY_WORDS = Property.externalTextBag(PREFIX + "Keywords");
+
+    Property PDF_VERSION = Property.externalText(PREFIX + "PDFVersion");
+
+    Property PRODUCER = Property.externalText(PREFIX + "Producer");
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 13d09e604..24996caac 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -368,21 +368,16 @@ public class PDFParser implements Parser, 
RenderingParser, Initializable {
             if (signature == null) {
                 continue;
             }
-            PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_NAME, 
signature.getName(),
-                    metadata);
+            PDMetadataExtractor.addNotNull(signature.getName(), metadata, 
TikaCoreProperties.SIGNATURE_NAME);
 
             Calendar date = signature.getSignDate();
             if (date != null) {
                 metadata.add(TikaCoreProperties.SIGNATURE_DATE, date);
             }
-            
PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_CONTACT_INFO,
-                    signature.getContactInfo(), metadata);
-            PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_FILTER,
-                    signature.getFilter(), metadata);
-            
PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_LOCATION,
-                    signature.getLocation(), metadata);
-            PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_REASON,
-                    signature.getReason(), metadata);
+            PDMetadataExtractor.addNotNull(signature.getContactInfo(), 
metadata, TikaCoreProperties.SIGNATURE_CONTACT_INFO);
+            PDMetadataExtractor.addNotNull(signature.getFilter(), metadata, 
TikaCoreProperties.SIGNATURE_FILTER);
+            PDMetadataExtractor.addNotNull(signature.getLocation(), metadata, 
TikaCoreProperties.SIGNATURE_LOCATION);
+            PDMetadataExtractor.addNotNull(signature.getReason(), metadata, 
TikaCoreProperties.SIGNATURE_REASON);
             hasSignature = true;
 
         }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 9489fd007..9b497cb8c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -46,6 +46,8 @@ import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDC;
+import org.apache.tika.metadata.XMPPDF;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaIllustrator;
 import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaPDFUA;
@@ -113,20 +115,25 @@ public class PDMetadataExtractor {
         } catch (IOException e) {
             //swallow
         }
-        if (dcSchema != null) {
-            extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, 
null, dcSchema);
-            extractDublinCoreListItems(metadata, 
TikaCoreProperties.CONTRIBUTOR, dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, 
dcSchema);
-            extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, 
dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.TYPE, 
dcSchema); // finds only the first one?!
-            extractDublinCoreSimpleItem(metadata, 
TikaCoreProperties.IDENTIFIER, dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.LANGUAGE, 
dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.PUBLISHER, 
dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.RELATION, 
dcSchema);
-            extractDublinCoreSimpleItem(metadata, TikaCoreProperties.SOURCE, 
dcSchema);
-            extractDublinCoreListItems(metadata, TikaCoreProperties.SUBJECT, 
dcSchema);
-            extractMultilingualItems(metadata, TikaCoreProperties.RIGHTS, 
null, dcSchema);
+        if (dcSchema == null) {
+            return;
         }
+        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER, 
XMPDC.IDENTIFIER);
+        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
+
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.CONTRIBUTOR.getName(), TikaCoreProperties.CONTRIBUTOR, 
XMPDC.CONTRIBUTOR);
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.CREATOR.getName(), TikaCoreProperties.CREATOR, 
XMPDC.CREATOR);
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.LANGUAGE.getName(), TikaCoreProperties.LANGUAGE, 
XMPDC.LANGUAGE);
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.PUBLISHER.getName(), TikaCoreProperties.PUBLISHER, 
XMPDC.PUBLISHER);
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.RELATION.getName(), TikaCoreProperties.RELATION, 
XMPDC.RELATION);
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.SUBJECT.getName(), TikaCoreProperties.SUBJECT, 
XMPDC.SUBJECT);
+        // finds only the first one?!
+        extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.TYPE.getName(), TikaCoreProperties.TYPE, XMPDC.TYPE);
+
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION, 
XMPDC.DESCRIPTION);
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
+
     }
 
     private static void extractPDFVT(XMPMetadata xmp, Metadata metadata) {
@@ -261,9 +268,9 @@ public class PDMetadataExtractor {
         if (pdf == null) {
             return;
         }
-        setNotNull(PDF.PRODUCER, pdf.getProducer(), metadata);
-        setNotNull(Office.KEYWORDS, pdf.getKeywords(), metadata);
-        setNotNull(PDF.PDF_VERSION, pdf.getPDFVersion(), metadata);
+        setNotNull(pdf.getProducer(), metadata, PDF.PRODUCER, XMPPDF.PRODUCER);
+        setNotNull(pdf.getKeywords(), metadata, Office.KEYWORDS, 
XMPPDF.KEY_WORDS);
+        setNotNull(pdf.getPDFVersion(), metadata, PDF.PDF_VERSION, 
XMPPDF.PDF_VERSION);
     }
 
     private static void extractBasic(XMPMetadata xmp, Metadata metadata) {
@@ -280,12 +287,11 @@ public class PDMetadataExtractor {
         if (basic == null) {
             return;
         }
-        //add the elements from the basic schema if they haven't already
-        //been extracted from dublin core
-        setNotNull(XMP.CREATOR_TOOL, basic.getCreatorTool(), metadata);
-        setNotNull(DublinCore.TITLE, basic.getTitle(), metadata);
-        setNotNull(XMP.ABOUT, basic.getAbout(), metadata);
-        setNotNull(XMP.LABEL, basic.getLabel(), metadata);
+        //add the elements from the basic schema
+        setNotNull(basic.getCreatorTool(), metadata, XMP.CREATOR_TOOL);
+        setNotNull(basic.getTitle(), metadata, DublinCore.TITLE, XMP.TITLE);
+        setNotNull(basic.getAbout(), metadata, XMP.ABOUT);
+        setNotNull(basic.getLabel(), metadata, XMP.LABEL);
         try {
             setNotNull(XMP.CREATE_DATE, basic.getCreateDate(), metadata);
         } catch (IOException e) {
@@ -314,7 +320,7 @@ public class PDMetadataExtractor {
                 metadata.add(XMP.ADVISORY, advisory);
             }
         }
-        setNotNull(XMP.NICKNAME, basic.getNickname(), metadata);
+        setNotNull(basic.getNickname(), metadata, XMP.NICKNAME);
         try {
             setNotNull(XMP.RATING, basic.getRating(), metadata);
         } catch (NumberFormatException e) {
@@ -324,9 +330,13 @@ public class PDMetadataExtractor {
         //and figure out how to add that info
     }
 
-    private static void setNotNull(Property property, String value, Metadata 
metadata) {
-        if (metadata.get(property) == null && value != null && 
!value.isBlank()) {
-            metadata.set(property, decode(value));
+    private static void setNotNull(String value, Metadata metadata, Property 
... properties) {
+        if (value == null || value.isBlank()) {
+            return;
+        }
+        String decoded = decode(value);
+        for (Property property : properties) {
+            metadata.set(property, decoded);
         }
     }
 
@@ -342,8 +352,11 @@ public class PDMetadataExtractor {
         }
     }
 
-    static void addNotNull(Property property, String value, Metadata metadata) 
{
-        if (! StringUtils.isBlank(value)) {
+    static void addNotNull(String value, Metadata metadata, Property ... 
properties) {
+        if (StringUtils.isBlank(value)) {
+            return;
+        }
+        for (Property property : properties) {
             metadata.add(property, value);
         }
     }
@@ -375,45 +388,20 @@ public class PDMetadataExtractor {
      * values (see TIKA-1295)
      *
      * @param metadata
-     * @param property
-     * @param pdfBoxBaseline
-     * @param schema
+     * @param schema schema - must be non-null
+     * @param dcName dublin core name for the property to select from the xmp 
schema
+     * @param properties property names to set to this value
      */
-    private static void extractMultilingualItems(Metadata metadata, Property 
property,
-                                                 String pdfBoxBaseline, 
XMPSchema schema) {
-        //if schema is null, just go with pdfBoxBaseline
-        if (schema == null) {
-            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-                addMetadata(metadata, property, pdfBoxBaseline);
-            }
-            return;
-        }
-
-        for (String lang : 
schema.getLanguagePropertyLanguages(property.getName())) {
-            String value = schema.getLanguageProperty(property.getName(), 
lang);
+    private static void extractMultilingualItems(Metadata metadata, XMPSchema 
schema, String dcName, Property ... properties) {
 
-            if (value != null && value.length() > 0) {
-                //if you're going to add it below in the baseline addition, 
don't add it now
-                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
-                    continue;
-                }
-                addMetadata(metadata, property, value);
-                if (!property.isMultiValuePermitted()) {
-                    return;
+        for (Property property : properties) {
+            for (String lang : schema.getLanguagePropertyLanguages(dcName)) {
+                String value = schema.getLanguageProperty(dcName, lang);
+                if (value != null && ! value.isBlank()) {
+                    addMetadata(metadata, property, value);
                 }
             }
         }
-
-        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-            //if we've already added something above and multivalue is not 
permitted
-            //return.
-            if (!property.isMultiValuePermitted()) {
-                if (metadata.get(property) != null) {
-                    return;
-                }
-            }
-            addMetadata(metadata, property, pdfBoxBaseline);
-        }
     }
 
 
@@ -426,22 +414,22 @@ public class PDMetadataExtractor {
      * <p/>
      * This relies on the property having a DublinCore compliant getName()
      *
-     * @param property
-     * @param dc
      * @param metadata
+     * @param dc schema - must be non-null
+     * @param dcName -- name of the dc property to read from the dc schema
+     * @param properties -- property to set for this value in the metadata 
object
      */
-    private static void extractDublinCoreListItems(Metadata metadata, Property 
property,
-                                                   XMPSchemaDublinCore dc) {
-        //if no dc, add baseline and return
-        if (dc == null) {
-            return;
-        }
-        List<String> items = getXMPBagOrSeqList(dc, property.getName());
+    private static void extractDublinCoreListItems(Metadata metadata,
+                                                   XMPSchemaDublinCore dc, 
String dcName, Property ... properties) {
+
+        List<String> items = getXMPBagOrSeqList(dc, dcName);
         if (items == null) {
             return;
         }
-        for (String item : items) {
-            addMetadata(metadata, property, item);
+        for (Property property : properties) {
+            for (String item : items) {
+                addMetadata(metadata, property, item);
+            }
         }
     }
 
@@ -450,35 +438,44 @@ public class PDMetadataExtractor {
      * <p/>
      * This relies on the property having a DublinCore compliant getName()
      *
-     * @param property
-     * @param dc
      * @param metadata
+     * @param dc schema - must be non-null
+     * @param dcName -- name of the dc property to read from the dc schema
+     * @param properties -- property to set for this value in the metadata 
object
      */
-    private static void extractDublinCoreSimpleItem(Metadata metadata, 
Property property,
-                                                   XMPSchemaDublinCore dc) {
-        if (dc == null) {
-            return;
+    private static void extractDublinCoreSimpleItem(Metadata metadata,
+                                                   XMPSchemaDublinCore dc, 
String dcName, Property ... properties) {
+
+        String textProperty = dc.getTextProperty(dcName);
+        for (Property property : properties) {
+            addMetadata(metadata, property, textProperty);
         }
-        String textProperty = dc.getTextProperty(property.getName());
-        addMetadata(metadata, property, textProperty);
     }
 
+    /**
+     * Add non-null, non-empty and unique values to the Metadata object. If 
the property
+     * does not allow multiple values, silently fail to add values after the 
first.
+     * @param metadata
+     * @param property
+     * @param value
+     */
     static void addMetadata(Metadata metadata, Property property, String 
value) {
-        if (value != null) {
-            String decoded = decode(value);
-            if (StringUtils.isBlank(decoded)) {
-                return;
-            }
-            if (property.isMultiValuePermitted() || metadata.get(property) == 
null) {
-                for (String v : metadata.getValues(property)) {
-                    if (v.equals(decoded)) {
-                        return;
-                    }
+        if (value == null || value.isBlank()) {
+            return;
+        }
+        String decoded = decode(value);
+        if (StringUtils.isBlank(decoded)) {
+            return;
+        }
+        if (property.isMultiValuePermitted() || metadata.get(property) == 
null) {
+            for (String v : metadata.getValues(property)) {
+                if (v.equals(decoded)) {
+                    return;
                 }
-                metadata.add(property, decoded);
             }
-            //silently skip adding property that already exists if multiple 
values are not permitted
+            metadata.add(property, decoded);
         }
+        //silently skip adding property that already exists if multiple values 
are not permitted
     }
 
     static void addMetadata(Metadata metadata, String name, String value) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index d3f4f9f28..823e7bbba 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -55,10 +55,13 @@ import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.metadata.Font;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDC;
 import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.metadata.XMPPDF;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
@@ -1472,6 +1475,59 @@ public class PDFParserTest extends TikaTest {
      * TODO -- need to test signature extraction
      */
 
+    @Test
+    public void testMetadataKeyPrecision() throws Exception {
+        //TIKA-4444
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPDF-TIKA-4444.pdf");
+        Metadata m = metadataList.get(0);
+
+        assertEquals("xmp-dc-contributor", 
m.get(TikaCoreProperties.CONTRIBUTOR));
+        assertEquals("xmp-dc-creator", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("xmp-dc-description", 
m.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("application/pdf; version=1.3", 
m.get(TikaCoreProperties.FORMAT));
+        assertEquals("xmp-dc-identifier", 
m.get(TikaCoreProperties.IDENTIFIER));
+        assertEquals("xmp-dc-language", m.get(TikaCoreProperties.LANGUAGE));
+        assertEquals("xmp-dc-publisher", m.get(TikaCoreProperties.PUBLISHER));
+        assertEquals("xmp-dc-relation", m.get(TikaCoreProperties.RELATION));
+        assertEquals("xmp-dc-rights", m.get(TikaCoreProperties.RIGHTS));
+        assertEquals("xmp-dc-source", m.get(TikaCoreProperties.SOURCE));
+        assertEquals("xmp-dc-title", m.get(TikaCoreProperties.TITLE));
+        assertEquals("xmp-dc-type", m.get(TikaCoreProperties.TYPE));
+        assertEquals("pdf-author", m.get(PDF.DOC_INFO_CREATOR));
+        assertEquals("pdf-creator", m.get(PDF.DOC_INFO_CREATOR_TOOL));
+        assertEquals("pdf-keywords", m.get(PDF.DOC_INFO_KEY_WORDS));
+        
assertTrue(m.get(PDF.DOC_INFO_MODIFICATION_DATE).startsWith("2025-06"));
+        assertEquals("pypdf-5.6.1", m.get(PDF.DOC_INFO_PRODUCER));
+        assertEquals("pdf-subject", m.get(PDF.DOC_INFO_SUBJECT));
+        assertEquals("pdf-title", m.get(PDF.DOC_INFO_TITLE));
+        assertTrue(m.get(XMP.CREATE_DATE).startsWith("2025-02"));
+        assertEquals("xmp-xmp-creator-tool", m.get(XMP.CREATOR_TOOL));
+        assertTrue(m.get(XMP.METADATA_DATE).startsWith("2025-02"));
+        assertTrue(m.get(XMP.MODIFY_DATE).startsWith("2025-02"));
+        assertEquals("xmp-dc-contributor", m.get(XMPDC.CONTRIBUTOR));
+        assertEquals("xmp-dc-creator", m.get(XMPDC.CREATOR));
+        assertEquals("xmp-dc-description", m.get(XMPDC.DESCRIPTION));
+        assertEquals("xmp-dc-identifier", m.get(XMPDC.IDENTIFIER));
+        assertEquals("xmp-dc-language", m.get(XMPDC.LANGUAGE));
+        assertEquals("xmp-dc-publisher", m.get(XMPDC.PUBLISHER));
+        assertEquals("xmp-dc-relation", m.get(XMPDC.RELATION));
+        assertEquals("xmp-dc-rights", m.get(XMPDC.RIGHTS));
+        assertEquals("xmp-dc-source", m.get(XMPDC.SOURCE));
+        assertEquals("xmp-dc-subject", m.get(XMPDC.SUBJECT));
+        assertEquals("xmp-dc-title", m.get(XMPDC.TITLE));
+        assertEquals("xmp-dc-type", m.get(XMPDC.TYPE));
+        assertEquals("xmp-pdf-keywords", m.get(XMPPDF.KEY_WORDS));
+        assertEquals("xmp-pdf-version", m.get(XMPPDF.PDF_VERSION));
+        assertEquals("xmp-pdf-producer", m.get(XMPPDF.PRODUCER));
+        assertEquals("xmp-xmpmm-documentid", m.get(XMPMM.DOCUMENTID));
+        assertEquals("13", m.get(PagedText.N_PAGES));
+
+        String[] expectedSubjectVals = new String[]{
+                "xmp-pdf-keywords", "xmp-dc-subject", "pdf-keywords", 
"pdf-subject"
+        };
+        assertArrayEquals(expectedSubjectVals, 
m.getValues(TikaCoreProperties.SUBJECT));
+    }
+
     /**
     @Test
     public void testWriteLimit() throws Exception {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
new file mode 100644
index 000000000..b24ef757f
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
 differ

Reply via email to