This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push: new 5a64d7c3b TIKA-4449 (#2266) 5a64d7c3b is described below commit 5a64d7c3b803f076cb2ae0e9f71334b8b0ae1083 Author: Tim Allison <talli...@apache.org> AuthorDate: Tue Jul 8 07:51:58 2025 -0400 TIKA-4449 (#2266) * TIKA-4444 -- improve precision of metadata keys for xmp-derived metadata (cherry picked from commit 77a39c4f25e0e0f23cf6662bbad3a24b8dc94f65) --- .../java/org/apache/tika/metadata/DublinCore.java | 10 ++ .../main/java/org/apache/tika/metadata/XMP.java | 8 + .../tika/metadata/{DublinCore.java => XMPDC.java} | 17 +- .../main/java/org/apache/tika/metadata/XMPPDF.java | 42 +++++ .../java/org/apache/tika/parser/pdf/PDFParser.java | 15 +- .../tika/parser/pdf/PDMetadataExtractor.java | 183 ++++++++++----------- .../org/apache/tika/parser/pdf/PDFParserTest.java | 56 +++++++ .../resources/test-documents/testPDF-TIKA-4444.pdf | Bin 0 -> 103951 bytes 8 files changed, 220 insertions(+), 111 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java index 23750c35d..a4e32cb8b 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java @@ -20,6 +20,16 @@ package org.apache.tika.metadata; * A collection of Dublin Core metadata names. * * @see <a href="http://dublincore.org">dublincore.org</a> + * + * <p> + * These keys are intended to be a unifying normalization of information + * within a file. For some formats, like PDF, where there may be conflicting + * information in different parts of the file (xmp vs. docinfo) for the + * same metadata key, we do what we can, and these keys represent a + * normalization of metadata values within a file. + * <p> + * For Dublin Core information that derives specifically and only from + * XMP, see {@link XMPDC}. */ public interface DublinCore { diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java index bca38d40b..12842c5f8 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java @@ -16,6 +16,9 @@ */ package org.apache.tika.metadata; +/** + * Metadata keys for the XMP Basic Schema + */ public interface XMP { String NAMESPACE_URI = "http://ns.adobe.com/xap/1.0/"; @@ -86,4 +89,9 @@ public interface XMP { */ Property RATING = Property.externalInteger(PREFIX_ + "Rating"); + /** + * This doesn't belong to the XMP Basic schema. However, because it is part of + * JempBox's XMPBasicSchema, we include this here. + */ + Property TITLE = Property.externalText(PREFIX_ + "Title"); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java similarity index 92% copy from tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java copy to tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java index 23750c35d..26f60407f 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDC.java @@ -17,16 +17,17 @@ package org.apache.tika.metadata; /** - * A collection of Dublin Core metadata names. - * - * @see <a href="http://dublincore.org">dublincore.org</a> + * Metadata keys for the XMP DublinCore schema. This differs from {@link DublinCore} in + * that this data must derive strictly from XMP. Tika applies logic to normalize + * metadata keys and values into {@link DublinCore}. This process can make it difficult to determine + * if the underlying metadata derived from a literal XMP component or from another source within the file. + * <p> + * See <a href="https://issues.apache.org/jira/browse/TIKA-4444">TIKA-4444</a>. */ -public interface DublinCore { +public interface XMPDC { - String NAMESPACE_URI_DC = "http://purl.org/dc/elements/1.1/"; - String NAMESPACE_URI_DC_TERMS = "http://purl.org/dc/terms/"; - String PREFIX_DC = "dc"; - String PREFIX_DC_TERMS = "dcterms"; + String PREFIX_DC = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dc"; + String PREFIX_DC_TERMS = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "dcterms"; /** * Typically, Format may include the media-type or dimensions of the diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java new file mode 100644 index 000000000..a4d1bb13a --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPPDF.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata; + +/** + * Metadata keys for the XMP PDF Schema + */ +public interface XMPPDF { + + + String PREFIX = "xmp" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "pdf" + + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; + + /** + * Unordered text strings of about. + */ + Property ABOUT = Property.externalTextBag(PREFIX + "About"); + + /** + * Unordered text strings of keywords. + */ + Property KEY_WORDS = Property.externalTextBag(PREFIX + "Keywords"); + + Property PDF_VERSION = Property.externalText(PREFIX + "PDFVersion"); + + Property PRODUCER = Property.externalText(PREFIX + "Producer"); + +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 13d09e604..24996caac 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -368,21 +368,16 @@ public class PDFParser implements Parser, RenderingParser, Initializable { if (signature == null) { continue; } - PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_NAME, signature.getName(), - metadata); + PDMetadataExtractor.addNotNull(signature.getName(), metadata, TikaCoreProperties.SIGNATURE_NAME); Calendar date = signature.getSignDate(); if (date != null) { metadata.add(TikaCoreProperties.SIGNATURE_DATE, date); } - PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_CONTACT_INFO, - signature.getContactInfo(), metadata); - PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_FILTER, - signature.getFilter(), metadata); - PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_LOCATION, - signature.getLocation(), metadata); - PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_REASON, - signature.getReason(), metadata); + PDMetadataExtractor.addNotNull(signature.getContactInfo(), metadata, TikaCoreProperties.SIGNATURE_CONTACT_INFO); + PDMetadataExtractor.addNotNull(signature.getFilter(), metadata, TikaCoreProperties.SIGNATURE_FILTER); + PDMetadataExtractor.addNotNull(signature.getLocation(), metadata, TikaCoreProperties.SIGNATURE_LOCATION); + PDMetadataExtractor.addNotNull(signature.getReason(), metadata, TikaCoreProperties.SIGNATURE_REASON); hasSignature = true; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java index 9489fd007..9b497cb8c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java @@ -46,6 +46,8 @@ import org.apache.tika.metadata.PDF; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMP; +import org.apache.tika.metadata.XMPDC; +import org.apache.tika.metadata.XMPPDF; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaIllustrator; import org.apache.tika.parser.pdf.xmpschemas.XMPSchemaPDFUA; @@ -113,20 +115,25 @@ public class PDMetadataExtractor { } catch (IOException e) { //swallow } - if (dcSchema != null) { - extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema); - extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.TYPE, dcSchema); // finds only the first one?! - extractDublinCoreSimpleItem(metadata, TikaCoreProperties.IDENTIFIER, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.LANGUAGE, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.PUBLISHER, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.RELATION, dcSchema); - extractDublinCoreSimpleItem(metadata, TikaCoreProperties.SOURCE, dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.SUBJECT, dcSchema); - extractMultilingualItems(metadata, TikaCoreProperties.RIGHTS, null, dcSchema); + if (dcSchema == null) { + return; } + extractDublinCoreSimpleItem(metadata, dcSchema, TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER, XMPDC.IDENTIFIER); + extractDublinCoreSimpleItem(metadata, dcSchema, TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE); + + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.CONTRIBUTOR.getName(), TikaCoreProperties.CONTRIBUTOR, XMPDC.CONTRIBUTOR); + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.CREATOR.getName(), TikaCoreProperties.CREATOR, XMPDC.CREATOR); + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.LANGUAGE.getName(), TikaCoreProperties.LANGUAGE, XMPDC.LANGUAGE); + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.PUBLISHER.getName(), TikaCoreProperties.PUBLISHER, XMPDC.PUBLISHER); + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.RELATION.getName(), TikaCoreProperties.RELATION, XMPDC.RELATION); + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.SUBJECT.getName(), TikaCoreProperties.SUBJECT, XMPDC.SUBJECT); + // finds only the first one?! + extractDublinCoreListItems(metadata, dcSchema, TikaCoreProperties.TYPE.getName(), TikaCoreProperties.TYPE, XMPDC.TYPE); + + extractMultilingualItems(metadata, dcSchema, TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION, XMPDC.DESCRIPTION); + extractMultilingualItems(metadata, dcSchema, TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS); + extractMultilingualItems(metadata, dcSchema, TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE); + } private static void extractPDFVT(XMPMetadata xmp, Metadata metadata) { @@ -261,9 +268,9 @@ public class PDMetadataExtractor { if (pdf == null) { return; } - setNotNull(PDF.PRODUCER, pdf.getProducer(), metadata); - setNotNull(Office.KEYWORDS, pdf.getKeywords(), metadata); - setNotNull(PDF.PDF_VERSION, pdf.getPDFVersion(), metadata); + setNotNull(pdf.getProducer(), metadata, PDF.PRODUCER, XMPPDF.PRODUCER); + setNotNull(pdf.getKeywords(), metadata, Office.KEYWORDS, XMPPDF.KEY_WORDS); + setNotNull(pdf.getPDFVersion(), metadata, PDF.PDF_VERSION, XMPPDF.PDF_VERSION); } private static void extractBasic(XMPMetadata xmp, Metadata metadata) { @@ -280,12 +287,11 @@ public class PDMetadataExtractor { if (basic == null) { return; } - //add the elements from the basic schema if they haven't already - //been extracted from dublin core - setNotNull(XMP.CREATOR_TOOL, basic.getCreatorTool(), metadata); - setNotNull(DublinCore.TITLE, basic.getTitle(), metadata); - setNotNull(XMP.ABOUT, basic.getAbout(), metadata); - setNotNull(XMP.LABEL, basic.getLabel(), metadata); + //add the elements from the basic schema + setNotNull(basic.getCreatorTool(), metadata, XMP.CREATOR_TOOL); + setNotNull(basic.getTitle(), metadata, DublinCore.TITLE, XMP.TITLE); + setNotNull(basic.getAbout(), metadata, XMP.ABOUT); + setNotNull(basic.getLabel(), metadata, XMP.LABEL); try { setNotNull(XMP.CREATE_DATE, basic.getCreateDate(), metadata); } catch (IOException e) { @@ -314,7 +320,7 @@ public class PDMetadataExtractor { metadata.add(XMP.ADVISORY, advisory); } } - setNotNull(XMP.NICKNAME, basic.getNickname(), metadata); + setNotNull(basic.getNickname(), metadata, XMP.NICKNAME); try { setNotNull(XMP.RATING, basic.getRating(), metadata); } catch (NumberFormatException e) { @@ -324,9 +330,13 @@ public class PDMetadataExtractor { //and figure out how to add that info } - private static void setNotNull(Property property, String value, Metadata metadata) { - if (metadata.get(property) == null && value != null && !value.isBlank()) { - metadata.set(property, decode(value)); + private static void setNotNull(String value, Metadata metadata, Property ... properties) { + if (value == null || value.isBlank()) { + return; + } + String decoded = decode(value); + for (Property property : properties) { + metadata.set(property, decoded); } } @@ -342,8 +352,11 @@ public class PDMetadataExtractor { } } - static void addNotNull(Property property, String value, Metadata metadata) { - if (! StringUtils.isBlank(value)) { + static void addNotNull(String value, Metadata metadata, Property ... properties) { + if (StringUtils.isBlank(value)) { + return; + } + for (Property property : properties) { metadata.add(property, value); } } @@ -375,45 +388,20 @@ public class PDMetadataExtractor { * values (see TIKA-1295) * * @param metadata - * @param property - * @param pdfBoxBaseline - * @param schema + * @param schema schema - must be non-null + * @param dcName dublin core name for the property to select from the xmp schema + * @param properties property names to set to this value */ - private static void extractMultilingualItems(Metadata metadata, Property property, - String pdfBoxBaseline, XMPSchema schema) { - //if schema is null, just go with pdfBoxBaseline - if (schema == null) { - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - addMetadata(metadata, property, pdfBoxBaseline); - } - return; - } - - for (String lang : schema.getLanguagePropertyLanguages(property.getName())) { - String value = schema.getLanguageProperty(property.getName(), lang); + private static void extractMultilingualItems(Metadata metadata, XMPSchema schema, String dcName, Property ... properties) { - if (value != null && value.length() > 0) { - //if you're going to add it below in the baseline addition, don't add it now - if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) { - continue; - } - addMetadata(metadata, property, value); - if (!property.isMultiValuePermitted()) { - return; + for (Property property : properties) { + for (String lang : schema.getLanguagePropertyLanguages(dcName)) { + String value = schema.getLanguageProperty(dcName, lang); + if (value != null && ! value.isBlank()) { + addMetadata(metadata, property, value); } } } - - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - //if we've already added something above and multivalue is not permitted - //return. - if (!property.isMultiValuePermitted()) { - if (metadata.get(property) != null) { - return; - } - } - addMetadata(metadata, property, pdfBoxBaseline); - } } @@ -426,22 +414,22 @@ public class PDMetadataExtractor { * <p/> * This relies on the property having a DublinCore compliant getName() * - * @param property - * @param dc * @param metadata + * @param dc schema - must be non-null + * @param dcName -- name of the dc property to read from the dc schema + * @param properties -- property to set for this value in the metadata object */ - private static void extractDublinCoreListItems(Metadata metadata, Property property, - XMPSchemaDublinCore dc) { - //if no dc, add baseline and return - if (dc == null) { - return; - } - List<String> items = getXMPBagOrSeqList(dc, property.getName()); + private static void extractDublinCoreListItems(Metadata metadata, + XMPSchemaDublinCore dc, String dcName, Property ... properties) { + + List<String> items = getXMPBagOrSeqList(dc, dcName); if (items == null) { return; } - for (String item : items) { - addMetadata(metadata, property, item); + for (Property property : properties) { + for (String item : items) { + addMetadata(metadata, property, item); + } } } @@ -450,35 +438,44 @@ public class PDMetadataExtractor { * <p/> * This relies on the property having a DublinCore compliant getName() * - * @param property - * @param dc * @param metadata + * @param dc schema - must be non-null + * @param dcName -- name of the dc property to read from the dc schema + * @param properties -- property to set for this value in the metadata object */ - private static void extractDublinCoreSimpleItem(Metadata metadata, Property property, - XMPSchemaDublinCore dc) { - if (dc == null) { - return; + private static void extractDublinCoreSimpleItem(Metadata metadata, + XMPSchemaDublinCore dc, String dcName, Property ... properties) { + + String textProperty = dc.getTextProperty(dcName); + for (Property property : properties) { + addMetadata(metadata, property, textProperty); } - String textProperty = dc.getTextProperty(property.getName()); - addMetadata(metadata, property, textProperty); } + /** + * Add non-null, non-empty and unique values to the Metadata object. If the property + * does not allow multiple values, silently fail to add values after the first. + * @param metadata + * @param property + * @param value + */ static void addMetadata(Metadata metadata, Property property, String value) { - if (value != null) { - String decoded = decode(value); - if (StringUtils.isBlank(decoded)) { - return; - } - if (property.isMultiValuePermitted() || metadata.get(property) == null) { - for (String v : metadata.getValues(property)) { - if (v.equals(decoded)) { - return; - } + if (value == null || value.isBlank()) { + return; + } + String decoded = decode(value); + if (StringUtils.isBlank(decoded)) { + return; + } + if (property.isMultiValuePermitted() || metadata.get(property) == null) { + for (String v : metadata.getValues(property)) { + if (v.equals(decoded)) { + return; } - metadata.add(property, decoded); } - //silently skip adding property that already exists if multiple values are not permitted + metadata.add(property, decoded); } + //silently skip adding property that already exists if multiple values are not permitted } static void addMetadata(Metadata metadata, String name, String value) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index d3f4f9f28..823e7bbba 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -55,10 +55,13 @@ import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.metadata.Font; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PDF; +import org.apache.tika.metadata.PagedText; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaPagedText; import org.apache.tika.metadata.XMP; +import org.apache.tika.metadata.XMPDC; import org.apache.tika.metadata.XMPMM; +import org.apache.tika.metadata.XMPPDF; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; @@ -1472,6 +1475,59 @@ public class PDFParserTest extends TikaTest { * TODO -- need to test signature extraction */ + @Test + public void testMetadataKeyPrecision() throws Exception { + //TIKA-4444 + List<Metadata> metadataList = getRecursiveMetadata("testPDF-TIKA-4444.pdf"); + Metadata m = metadataList.get(0); + + assertEquals("xmp-dc-contributor", m.get(TikaCoreProperties.CONTRIBUTOR)); + assertEquals("xmp-dc-creator", m.get(TikaCoreProperties.CREATOR)); + assertEquals("xmp-dc-description", m.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("application/pdf; version=1.3", m.get(TikaCoreProperties.FORMAT)); + assertEquals("xmp-dc-identifier", m.get(TikaCoreProperties.IDENTIFIER)); + assertEquals("xmp-dc-language", m.get(TikaCoreProperties.LANGUAGE)); + assertEquals("xmp-dc-publisher", m.get(TikaCoreProperties.PUBLISHER)); + assertEquals("xmp-dc-relation", m.get(TikaCoreProperties.RELATION)); + assertEquals("xmp-dc-rights", m.get(TikaCoreProperties.RIGHTS)); + assertEquals("xmp-dc-source", m.get(TikaCoreProperties.SOURCE)); + assertEquals("xmp-dc-title", m.get(TikaCoreProperties.TITLE)); + assertEquals("xmp-dc-type", m.get(TikaCoreProperties.TYPE)); + assertEquals("pdf-author", m.get(PDF.DOC_INFO_CREATOR)); + assertEquals("pdf-creator", m.get(PDF.DOC_INFO_CREATOR_TOOL)); + assertEquals("pdf-keywords", m.get(PDF.DOC_INFO_KEY_WORDS)); + assertTrue(m.get(PDF.DOC_INFO_MODIFICATION_DATE).startsWith("2025-06")); + assertEquals("pypdf-5.6.1", m.get(PDF.DOC_INFO_PRODUCER)); + assertEquals("pdf-subject", m.get(PDF.DOC_INFO_SUBJECT)); + assertEquals("pdf-title", m.get(PDF.DOC_INFO_TITLE)); + assertTrue(m.get(XMP.CREATE_DATE).startsWith("2025-02")); + assertEquals("xmp-xmp-creator-tool", m.get(XMP.CREATOR_TOOL)); + assertTrue(m.get(XMP.METADATA_DATE).startsWith("2025-02")); + assertTrue(m.get(XMP.MODIFY_DATE).startsWith("2025-02")); + assertEquals("xmp-dc-contributor", m.get(XMPDC.CONTRIBUTOR)); + assertEquals("xmp-dc-creator", m.get(XMPDC.CREATOR)); + assertEquals("xmp-dc-description", m.get(XMPDC.DESCRIPTION)); + assertEquals("xmp-dc-identifier", m.get(XMPDC.IDENTIFIER)); + assertEquals("xmp-dc-language", m.get(XMPDC.LANGUAGE)); + assertEquals("xmp-dc-publisher", m.get(XMPDC.PUBLISHER)); + assertEquals("xmp-dc-relation", m.get(XMPDC.RELATION)); + assertEquals("xmp-dc-rights", m.get(XMPDC.RIGHTS)); + assertEquals("xmp-dc-source", m.get(XMPDC.SOURCE)); + assertEquals("xmp-dc-subject", m.get(XMPDC.SUBJECT)); + assertEquals("xmp-dc-title", m.get(XMPDC.TITLE)); + assertEquals("xmp-dc-type", m.get(XMPDC.TYPE)); + assertEquals("xmp-pdf-keywords", m.get(XMPPDF.KEY_WORDS)); + assertEquals("xmp-pdf-version", m.get(XMPPDF.PDF_VERSION)); + assertEquals("xmp-pdf-producer", m.get(XMPPDF.PRODUCER)); + assertEquals("xmp-xmpmm-documentid", m.get(XMPMM.DOCUMENTID)); + assertEquals("13", m.get(PagedText.N_PAGES)); + + String[] expectedSubjectVals = new String[]{ + "xmp-pdf-keywords", "xmp-dc-subject", "pdf-keywords", "pdf-subject" + }; + assertArrayEquals(expectedSubjectVals, m.getValues(TikaCoreProperties.SUBJECT)); + } + /** @Test public void testWriteLimit() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf new file mode 100644 index 000000000..b24ef757f Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF-TIKA-4444.pdf differ