This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4444
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4444 by this push:
new a7316a164 TIKA-4449 -- improve metadata key specificity for xml
new 903774b96 Merge remote-tracking branch 'origin/TIKA-4444' into
TIKA-4449
a7316a164 is described below
commit a7316a164b345a70a768e9db71aa441a8fc2e00b
Author: tallison <[email protected]>
AuthorDate: Wed Jul 2 11:33:49 2025 -0400
TIKA-4449 -- improve metadata key specificity for xml
---
.../java/org/apache/tika/metadata/DublinCore.java | 10 ++
.../main/java/org/apache/tika/metadata/XMP.java | 8 +
.../tika/metadata/{DublinCore.java => XMPDC.java} | 17 +-
.../main/java/org/apache/tika/metadata/XMPPDF.java | 42 +++++
tika-parent/pom.xml | 2 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 15 +-
.../tika/parser/pdf/PDMetadataExtractor.java | 183 ++++++++++-----------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 56 +++++++
.../resources/test-documents/testPDF-TIKA-4444.pdf | Bin 0 -> 103951 bytes
9 files changed, 221 insertions(+), 112 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index 23750c35d..a4e32cb8b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -20,6 +20,16 @@ package org.apache.tika.metadata;
* A collection of Dublin Core metadata names.
*
* @see <a href="http://dublincore.org">dublincore.org</a>
+ *
+ * <p>
+ * These keys are intended to be a unifying normalization of information
+ * within a file. For some formats, like PDF, there may be conflicting
+ * information in different parts of the file (XMP vs. docinfo) for the
+ * same metadata key. In those cases we do what we can to reconcile the
+ * values, and these keys hold the resulting normalized metadata.
+ * <p>
+ * For Dublin Core information that derives specifically and only from
+ * XMP, see {@link XMPDC}.
*/
public interface DublinCore {
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
index bca38d40b..12842c5f8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
@@ -16,6 +16,9 @@
*/
package org.apache.tika.metadata;
+/**
+ * Metadata keys for the XMP Basic Schema
+ */
public interface XMP {
String NAMESPACE_URI = "http://ns.adobe.com/xap/1.0/";
@@ -86,4 +89,9 @@ public interface XMP {
*/
Property RATING = Property.externalInteger(PREFIX_ + "Rating");
+ /**
+ * This doesn't belong to the XMP Basic schema. However, because it is
part of
+ * JempBox's XMPBasicSchema, we include this here.
+ */
+ Property TITLE = Property.externalText(PREFIX_ + "Title");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
similarity index 92%
copy from tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
copy to tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
index 23750c35d..26f60407f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java
@@ -17,16 +17,17 @@
package org.apache.tika.metadata;
/**
- * A collection of Dublin Core metadata names.
- *
- * @see <a href="http://dublincore.org">dublincore.org</a>
+ * Metadata keys for the XMP DublinCore schema. This differs from {@link
DublinCore} in
+ * that this data must derive strictly from XMP. Tika applies logic to
normalize
+ * metadata keys and values into {@link DublinCore}. This process can make it
difficult to determine
+ * if the underlying metadata derived from a literal XMP component or from
another source within the file.
+ * <p>
+ * See <a href="https://issues.apache.org/jira/browse/TIKA-4444">TIKA-4444</a>.
*/
-public interface DublinCore {
+public interface XMPDC {
- String NAMESPACE_URI_DC = "http://purl.org/dc/elements/1.1/";
- String NAMESPACE_URI_DC_TERMS = "http://purl.org/dc/terms/";
- String PREFIX_DC = "dc";
- String PREFIX_DC_TERMS = "dcterms";
+ String PREFIX_DC = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"dc";
+ String PREFIX_DC_TERMS = "xmp" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dcterms";
/**
* Typically, Format may include the media-type or dimensions of the
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java
b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java
new file mode 100644
index 000000000..a4d1bb13a
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * Metadata keys for the XMP PDF Schema
+ */
+public interface XMPPDF {
+
+
+ String PREFIX = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"pdf"
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+ /**
+ * Unordered bag of text values stored under the "About" key.
+ */
+ Property ABOUT = Property.externalTextBag(PREFIX + "About");
+
+ /**
+ * Unordered text strings of keywords.
+ */
+ Property KEY_WORDS = Property.externalTextBag(PREFIX + "Keywords");
+
+ Property PDF_VERSION = Property.externalText(PREFIX + "PDFVersion");
+
+ Property PRODUCER = Property.externalText(PREFIX + "Producer");
+
+}
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index e0ba1333d..18c6fc2e8 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -399,7 +399,7 @@
<jsoup.version>1.21.1</jsoup.version>
<jsr305.version>3.0.2</jsr305.version>
<junit4.version>4.13.2</junit4.version>
- <junit5.version>6.0.0-M1</junit5.version>
+ <junit5.version>5.13.2</junit5.version>
<juniversalchardet.version>2.5.0</juniversalchardet.version>
<junrar.version>7.5.5</junrar.version>
<jwarc.version>0.31.1</jwarc.version>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 0d92ee520..40228bfe4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -368,21 +368,16 @@ public class PDFParser implements Parser,
RenderingParser, Initializable {
if (signature == null) {
continue;
}
- PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_NAME,
signature.getName(),
- metadata);
+ PDMetadataExtractor.addNotNull(signature.getName(), metadata,
TikaCoreProperties.SIGNATURE_NAME);
Calendar date = signature.getSignDate();
if (date != null) {
metadata.add(TikaCoreProperties.SIGNATURE_DATE, date);
}
-
PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_CONTACT_INFO,
- signature.getContactInfo(), metadata);
- PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_FILTER,
- signature.getFilter(), metadata);
-
PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_LOCATION,
- signature.getLocation(), metadata);
- PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_REASON,
- signature.getReason(), metadata);
+ PDMetadataExtractor.addNotNull(signature.getContactInfo(),
metadata, TikaCoreProperties.SIGNATURE_CONTACT_INFO);
+ PDMetadataExtractor.addNotNull(signature.getFilter(), metadata,
TikaCoreProperties.SIGNATURE_FILTER);
+ PDMetadataExtractor.addNotNull(signature.getLocation(), metadata,
TikaCoreProperties.SIGNATURE_LOCATION);
+ PDMetadataExtractor.addNotNull(signature.getReason(), metadata,
TikaCoreProperties.SIGNATURE_REASON);
hasSignature = true;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 9489fd007..9b497cb8c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -46,6 +46,8 @@ import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDC;
+import org.apache.tika.metadata.XMPPDF;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaIllustrator;
import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaPDFUA;
@@ -113,20 +115,25 @@ public class PDMetadataExtractor {
} catch (IOException e) {
//swallow
}
- if (dcSchema != null) {
- extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION,
null, dcSchema);
- extractDublinCoreListItems(metadata,
TikaCoreProperties.CONTRIBUTOR, dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR,
dcSchema);
- extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null,
dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.TYPE,
dcSchema); // finds only the first one?!
- extractDublinCoreSimpleItem(metadata,
TikaCoreProperties.IDENTIFIER, dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.LANGUAGE,
dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.PUBLISHER,
dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.RELATION,
dcSchema);
- extractDublinCoreSimpleItem(metadata, TikaCoreProperties.SOURCE,
dcSchema);
- extractDublinCoreListItems(metadata, TikaCoreProperties.SUBJECT,
dcSchema);
- extractMultilingualItems(metadata, TikaCoreProperties.RIGHTS,
null, dcSchema);
+ if (dcSchema == null) {
+ return;
}
+ extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER,
XMPDC.IDENTIFIER);
+ extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
+
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.CONTRIBUTOR.getName(), TikaCoreProperties.CONTRIBUTOR,
XMPDC.CONTRIBUTOR);
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.CREATOR.getName(), TikaCoreProperties.CREATOR,
XMPDC.CREATOR);
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.LANGUAGE.getName(), TikaCoreProperties.LANGUAGE,
XMPDC.LANGUAGE);
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.PUBLISHER.getName(), TikaCoreProperties.PUBLISHER,
XMPDC.PUBLISHER);
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.RELATION.getName(), TikaCoreProperties.RELATION,
XMPDC.RELATION);
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.SUBJECT.getName(), TikaCoreProperties.SUBJECT,
XMPDC.SUBJECT);
+ // finds only the first one?!
+ extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.TYPE.getName(), TikaCoreProperties.TYPE, XMPDC.TYPE);
+
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION,
XMPDC.DESCRIPTION);
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
+
}
private static void extractPDFVT(XMPMetadata xmp, Metadata metadata) {
@@ -261,9 +268,9 @@ public class PDMetadataExtractor {
if (pdf == null) {
return;
}
- setNotNull(PDF.PRODUCER, pdf.getProducer(), metadata);
- setNotNull(Office.KEYWORDS, pdf.getKeywords(), metadata);
- setNotNull(PDF.PDF_VERSION, pdf.getPDFVersion(), metadata);
+ setNotNull(pdf.getProducer(), metadata, PDF.PRODUCER, XMPPDF.PRODUCER);
+ setNotNull(pdf.getKeywords(), metadata, Office.KEYWORDS,
XMPPDF.KEY_WORDS);
+ setNotNull(pdf.getPDFVersion(), metadata, PDF.PDF_VERSION,
XMPPDF.PDF_VERSION);
}
private static void extractBasic(XMPMetadata xmp, Metadata metadata) {
@@ -280,12 +287,11 @@ public class PDMetadataExtractor {
if (basic == null) {
return;
}
- //add the elements from the basic schema if they haven't already
- //been extracted from dublin core
- setNotNull(XMP.CREATOR_TOOL, basic.getCreatorTool(), metadata);
- setNotNull(DublinCore.TITLE, basic.getTitle(), metadata);
- setNotNull(XMP.ABOUT, basic.getAbout(), metadata);
- setNotNull(XMP.LABEL, basic.getLabel(), metadata);
+ //add the elements from the basic schema
+ setNotNull(basic.getCreatorTool(), metadata, XMP.CREATOR_TOOL);
+ setNotNull(basic.getTitle(), metadata, DublinCore.TITLE, XMP.TITLE);
+ setNotNull(basic.getAbout(), metadata, XMP.ABOUT);
+ setNotNull(basic.getLabel(), metadata, XMP.LABEL);
try {
setNotNull(XMP.CREATE_DATE, basic.getCreateDate(), metadata);
} catch (IOException e) {
@@ -314,7 +320,7 @@ public class PDMetadataExtractor {
metadata.add(XMP.ADVISORY, advisory);
}
}
- setNotNull(XMP.NICKNAME, basic.getNickname(), metadata);
+ setNotNull(basic.getNickname(), metadata, XMP.NICKNAME);
try {
setNotNull(XMP.RATING, basic.getRating(), metadata);
} catch (NumberFormatException e) {
@@ -324,9 +330,13 @@ public class PDMetadataExtractor {
//and figure out how to add that info
}
- private static void setNotNull(Property property, String value, Metadata
metadata) {
- if (metadata.get(property) == null && value != null &&
!value.isBlank()) {
- metadata.set(property, decode(value));
+ private static void setNotNull(String value, Metadata metadata, Property
... properties) {
+ if (value == null || value.isBlank()) {
+ return;
+ }
+ String decoded = decode(value);
+ for (Property property : properties) {
+ metadata.set(property, decoded);
}
}
@@ -342,8 +352,11 @@ public class PDMetadataExtractor {
}
}
- static void addNotNull(Property property, String value, Metadata metadata)
{
- if (! StringUtils.isBlank(value)) {
+ static void addNotNull(String value, Metadata metadata, Property ...
properties) {
+ if (StringUtils.isBlank(value)) {
+ return;
+ }
+ for (Property property : properties) {
metadata.add(property, value);
}
}
@@ -375,45 +388,20 @@ public class PDMetadataExtractor {
* values (see TIKA-1295)
*
* @param metadata
- * @param property
- * @param pdfBoxBaseline
- * @param schema
+ * @param schema schema - must be non-null
+ * @param dcName dublin core name for the property to select from the xmp
schema
+ * @param properties property names to set to this value
*/
- private static void extractMultilingualItems(Metadata metadata, Property
property,
- String pdfBoxBaseline,
XMPSchema schema) {
- //if schema is null, just go with pdfBoxBaseline
- if (schema == null) {
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- addMetadata(metadata, property, pdfBoxBaseline);
- }
- return;
- }
-
- for (String lang :
schema.getLanguagePropertyLanguages(property.getName())) {
- String value = schema.getLanguageProperty(property.getName(),
lang);
+ private static void extractMultilingualItems(Metadata metadata, XMPSchema
schema, String dcName, Property ... properties) {
- if (value != null && value.length() > 0) {
- //if you're going to add it below in the baseline addition,
don't add it now
- if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
- continue;
- }
- addMetadata(metadata, property, value);
- if (!property.isMultiValuePermitted()) {
- return;
+ for (Property property : properties) {
+ for (String lang : schema.getLanguagePropertyLanguages(dcName)) {
+ String value = schema.getLanguageProperty(dcName, lang);
+ if (value != null && ! value.isBlank()) {
+ addMetadata(metadata, property, value);
}
}
}
-
- if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
- //if we've already added something above and multivalue is not
permitted
- //return.
- if (!property.isMultiValuePermitted()) {
- if (metadata.get(property) != null) {
- return;
- }
- }
- addMetadata(metadata, property, pdfBoxBaseline);
- }
}
@@ -426,22 +414,22 @@ public class PDMetadataExtractor {
* <p/>
* This relies on the property having a DublinCore compliant getName()
*
- * @param property
- * @param dc
* @param metadata
+ * @param dc schema - must be non-null
+ * @param dcName -- name of the dc property to read from the dc schema
+ * @param properties -- property to set for this value in the metadata
object
*/
- private static void extractDublinCoreListItems(Metadata metadata, Property
property,
- XMPSchemaDublinCore dc) {
- //if no dc, add baseline and return
- if (dc == null) {
- return;
- }
- List<String> items = getXMPBagOrSeqList(dc, property.getName());
+ private static void extractDublinCoreListItems(Metadata metadata,
+ XMPSchemaDublinCore dc,
String dcName, Property ... properties) {
+
+ List<String> items = getXMPBagOrSeqList(dc, dcName);
if (items == null) {
return;
}
- for (String item : items) {
- addMetadata(metadata, property, item);
+ for (Property property : properties) {
+ for (String item : items) {
+ addMetadata(metadata, property, item);
+ }
}
}
@@ -450,35 +438,44 @@ public class PDMetadataExtractor {
* <p/>
* This relies on the property having a DublinCore compliant getName()
*
- * @param property
- * @param dc
* @param metadata
+ * @param dc schema - must be non-null
+ * @param dcName -- name of the dc property to read from the dc schema
+ * @param properties -- property to set for this value in the metadata
object
*/
- private static void extractDublinCoreSimpleItem(Metadata metadata,
Property property,
- XMPSchemaDublinCore dc) {
- if (dc == null) {
- return;
+ private static void extractDublinCoreSimpleItem(Metadata metadata,
+ XMPSchemaDublinCore dc,
String dcName, Property ... properties) {
+
+ String textProperty = dc.getTextProperty(dcName);
+ for (Property property : properties) {
+ addMetadata(metadata, property, textProperty);
}
- String textProperty = dc.getTextProperty(property.getName());
- addMetadata(metadata, property, textProperty);
}
+ /**
+ * Add non-null, non-empty and unique values to the Metadata object. If
the property
+ * does not allow multiple values, silently fail to add values after the
first.
+ * @param metadata
+ * @param property
+ * @param value
+ */
static void addMetadata(Metadata metadata, Property property, String
value) {
- if (value != null) {
- String decoded = decode(value);
- if (StringUtils.isBlank(decoded)) {
- return;
- }
- if (property.isMultiValuePermitted() || metadata.get(property) ==
null) {
- for (String v : metadata.getValues(property)) {
- if (v.equals(decoded)) {
- return;
- }
+ if (value == null || value.isBlank()) {
+ return;
+ }
+ String decoded = decode(value);
+ if (StringUtils.isBlank(decoded)) {
+ return;
+ }
+ if (property.isMultiValuePermitted() || metadata.get(property) ==
null) {
+ for (String v : metadata.getValues(property)) {
+ if (v.equals(decoded)) {
+ return;
}
- metadata.add(property, decoded);
}
- //silently skip adding property that already exists if multiple
values are not permitted
+ metadata.add(property, decoded);
}
+ //silently skip adding property that already exists if multiple values
are not permitted
}
static void addMetadata(Metadata metadata, String name, String value) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0d85d0ed5..99e3d95bb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -55,10 +55,13 @@ import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDC;
import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.metadata.XMPPDF;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
@@ -1472,6 +1475,59 @@ public class PDFParserTest extends TikaTest {
* TODO -- need to test signature extraction
*/
+ @Test
+ public void testMetadataKeyPrecision() throws Exception {
+ //TIKA-4444
+ List<Metadata> metadataList =
getRecursiveMetadata("testPDF-TIKA-4444.pdf");
+ Metadata m = metadataList.get(0);
+
+ assertEquals("xmp-dc-contributor",
m.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("xmp-dc-creator", m.get(TikaCoreProperties.CREATOR));
+ assertEquals("xmp-dc-description",
m.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("application/pdf; version=1.3",
m.get(TikaCoreProperties.FORMAT));
+ assertEquals("xmp-dc-identifier",
m.get(TikaCoreProperties.IDENTIFIER));
+ assertEquals("xmp-dc-language", m.get(TikaCoreProperties.LANGUAGE));
+ assertEquals("xmp-dc-publisher", m.get(TikaCoreProperties.PUBLISHER));
+ assertEquals("xmp-dc-relation", m.get(TikaCoreProperties.RELATION));
+ assertEquals("xmp-dc-rights", m.get(TikaCoreProperties.RIGHTS));
+ assertEquals("xmp-dc-source", m.get(TikaCoreProperties.SOURCE));
+ assertEquals("xmp-dc-title", m.get(TikaCoreProperties.TITLE));
+ assertEquals("xmp-dc-type", m.get(TikaCoreProperties.TYPE));
+ assertEquals("pdf-author", m.get(PDF.DOC_INFO_CREATOR));
+ assertEquals("pdf-creator", m.get(PDF.DOC_INFO_CREATOR_TOOL));
+ assertEquals("pdf-keywords", m.get(PDF.DOC_INFO_KEY_WORDS));
+
assertTrue(m.get(PDF.DOC_INFO_MODIFICATION_DATE).startsWith("2025-06"));
+ assertEquals("pypdf-5.6.1", m.get(PDF.DOC_INFO_PRODUCER));
+ assertEquals("pdf-subject", m.get(PDF.DOC_INFO_SUBJECT));
+ assertEquals("pdf-title", m.get(PDF.DOC_INFO_TITLE));
+ assertTrue(m.get(XMP.CREATE_DATE).startsWith("2025-02"));
+ assertEquals("xmp-xmp-creator-tool", m.get(XMP.CREATOR_TOOL));
+ assertTrue(m.get(XMP.METADATA_DATE).startsWith("2025-02"));
+ assertTrue(m.get(XMP.MODIFY_DATE).startsWith("2025-02"));
+ assertEquals("xmp-dc-contributor", m.get(XMPDC.CONTRIBUTOR));
+ assertEquals("xmp-dc-creator", m.get(XMPDC.CREATOR));
+ assertEquals("xmp-dc-description", m.get(XMPDC.DESCRIPTION));
+ assertEquals("xmp-dc-identifier", m.get(XMPDC.IDENTIFIER));
+ assertEquals("xmp-dc-language", m.get(XMPDC.LANGUAGE));
+ assertEquals("xmp-dc-publisher", m.get(XMPDC.PUBLISHER));
+ assertEquals("xmp-dc-relation", m.get(XMPDC.RELATION));
+ assertEquals("xmp-dc-rights", m.get(XMPDC.RIGHTS));
+ assertEquals("xmp-dc-source", m.get(XMPDC.SOURCE));
+ assertEquals("xmp-dc-subject", m.get(XMPDC.SUBJECT));
+ assertEquals("xmp-dc-title", m.get(XMPDC.TITLE));
+ assertEquals("xmp-dc-type", m.get(XMPDC.TYPE));
+ assertEquals("xmp-pdf-keywords", m.get(XMPPDF.KEY_WORDS));
+ assertEquals("xmp-pdf-version", m.get(XMPPDF.PDF_VERSION));
+ assertEquals("xmp-pdf-producer", m.get(XMPPDF.PRODUCER));
+ assertEquals("xmp-xmpmm-documentid", m.get(XMPMM.DOCUMENTID));
+ assertEquals("13", m.get(PagedText.N_PAGES));
+
+ String[] expectedSubjectVals = new String[]{
+ "xmp-pdf-keywords", "xmp-dc-subject", "pdf-keywords",
"pdf-subject"
+ };
+ assertArrayEquals(expectedSubjectVals,
m.getValues(TikaCoreProperties.SUBJECT));
+ }
+
/**
@Test
public void testWriteLimit() throws Exception {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
new file mode 100644
index 000000000..b24ef757f
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf
differ