This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new 79500639f TIKA-4466 -- allow multiple values for many DublinCore 
values (#2308)
79500639f is described below

commit 79500639ff9b495091a11b0e8777bcb1b6aad915
Author: Tim Allison <[email protected]>
AuthorDate: Wed Aug 20 08:55:19 2025 -0400

    TIKA-4466 -- allow multiple values for many DublinCore values (#2308)
    
    * TIKA-4466 -- allow multiple values for many DublinCore values
    
    (cherry picked from commit dc69e39e332adee20c54b7e8db040e4b771e8fdc)
---
 CHANGES.txt                                        |   7 ++++
 .../java/org/apache/tika/metadata/DublinCore.java  |  20 +++++-----
 .../apache/tika/parser/epub/EpubParserTest.java    |  42 +++++++++++++++++++++
 .../testEPUB_multi-metadata-vals.epub              | Bin 0 -> 571926 bytes
 .../tika/parser/pdf/PDMetadataExtractor.java       |  16 +++++---
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  12 +++---
 .../org/apache/tika/parser/xml/DcXMLParser.java    |   1 +
 .../apache/tika/xmp/convert/AbstractConverter.java |   2 +-
 .../java/org/apache/tika/xmp/TikaToXMPTest.java    |   4 +-
 9 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index f3b5f6e97..e37862317 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,9 +1,16 @@
+Release 3.3.0 - ???
+
+  * Allow multiple values for many Dublin Core values (TIKA-4466).
+
 Release 3.2.2 - 8/6/2025
 
   * Improve detection of encrypted ODT files (TIKA-4459)
 
   * Dependency updates (TIKA-4455).
 
+   * Fix concurrency bug in TikaToXMP (TIKA-4393)
+
+
 Release 3.2.1 - 6/26/2025
 
   * Fix POIFSContainerDetector regression when wrapping an InputStream in
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java 
b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index a4e32cb8b..283080a0d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -57,7 +57,7 @@ public interface DublinCore {
      * the Digital Object Identifier (DOI) and the International Standard
      * Book Number (ISBN).
      */
-    Property IDENTIFIER = Property.internalText(
+    Property IDENTIFIER = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"identifier");
 
     /**
@@ -85,7 +85,7 @@ public interface DublinCore {
      * appropriate, named places or time periods be used in preference to
      * numeric identifiers such as sets of coordinates or date ranges.
      */
-    Property COVERAGE = Property.internalText(
+    Property COVERAGE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"coverage");
 
     /**
@@ -118,7 +118,7 @@ public interface DublinCore {
      * a graphical representation of content or a free-text account of
      * the content.
      */
-    Property DESCRIPTION = Property.internalText(
+    Property DESCRIPTION = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"description");
 
     /**
@@ -128,7 +128,7 @@ public interface DublinCore {
      * tags with optional subtags. Examples include "en" or "eng" for English,
      * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
      */
-    Property LANGUAGE = Property.internalText(
+    Property LANGUAGE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"language");
 
     /**
@@ -136,7 +136,7 @@ public interface DublinCore {
      * a Publisher include a person, an organisation, or a service. Typically,
      * the name of a Publisher should be used to indicate the entity.
      */
-    Property PUBLISHER = Property.internalText(
+    Property PUBLISHER = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"publisher");
 
     /**
@@ -144,7 +144,7 @@ public interface DublinCore {
      * reference the resource by means of a string or number conforming to
      * a formal identification system.
      */
-    Property RELATION = Property.internalText(
+    Property RELATION = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"relation");
 
     /**
@@ -156,7 +156,7 @@ public interface DublinCore {
      * is absent, no assumptions can be made about the status of these and
      * other rights with respect to the resource.
      */
-    Property RIGHTS = Property.internalText(
+    Property RIGHTS = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"rights");
 
     /**
@@ -166,7 +166,7 @@ public interface DublinCore {
      * means of a string or number conforming to a formal identification
      * system.
      */
-    Property SOURCE = Property.internalText(
+    Property SOURCE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"source");
 
     /**
@@ -183,7 +183,7 @@ public interface DublinCore {
      * A name given to the resource. Typically, a Title will be a name by
      * which the resource is formally known.
      */
-    Property TITLE = Property.internalText(
+    Property TITLE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"title");
 
     /**
@@ -194,7 +194,7 @@ public interface DublinCore {
      * [DCMITYPE]). To describe the physical or digital manifestation of
      * the resource, use the Format element.
      */
-    Property TYPE = Property.internalText(
+    Property TYPE = Property.internalTextBag(
             PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + 
"type");
 
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index 57e6a9a3b..ae9305a49 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -18,9 +18,13 @@ package org.apache.tika.parser.epub;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
@@ -29,6 +33,7 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Epub;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -131,4 +136,41 @@ public class EpubParserTest extends TikaTest {
         List<Metadata> metadataList = 
getRecursiveMetadata("cole-voyage-of-life.epub");
         assertEquals("pre-paginated", 
metadataList.get(0).get(Epub.RENDITION_LAYOUT));
     }
+
+    @Test
+    public void testMultipleMetadataValues() throws Exception {
+        //TIKA-4466
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEPUB_multi-metadata-vals.epub");
+        Set<String> publishers = Set.of("Standard Ebooks", "Guternberg");
+        Set<String> titles = Set.of("The Inheritors", "An Extravagant Story", 
"The Inheritors: An Extravagant Story");
+        Set<String> contributors = Set.of("The League of Moveable Type", 
"zikasak", "William Holyoake", "Clare Boothby",
+                "Graeme Mackreth", "Distributed Proofreaders", "Szymon Szott", 
"David Reimer");
+        Set<String> creators = Set.of("Joseph Conrad", "Ford Madox Ford");
+        Set<String> languages = Set.of("en-GB", "en-US");
+        Set<String> descriptions = Set.of("A young writer dabbling in 
journalism meets a strange, otherworldly woman with long-term political goals.",
+                "additional description");
+        Set<String> sources = Set.of("https://www.gutenberg.org/ebooks/14888", 
"https://archive.org/details/inheritorsanext01fordgoog/");
+        Set<String> identifiers = 
Set.of("https://standardebooks.org/ebooks/joseph-conrad_ford-madox-ford/the-inheritors",
+                "isbn:0571225470");
+        Set<String> subjects = Set.of("Science fiction");
+
+        Metadata m = metadataList.get(0);
+        assertEquals(publishers, set(m, TikaCoreProperties.PUBLISHER));
+        assertEquals(titles, set(m, TikaCoreProperties.TITLE));
+        assertEquals(contributors, set(m, TikaCoreProperties.CONTRIBUTOR));
+        assertEquals(creators, set(m, TikaCoreProperties.CREATOR));
+        assertEquals(languages, set(m, TikaCoreProperties.LANGUAGE));
+        assertEquals(descriptions, set(m, TikaCoreProperties.DESCRIPTION));
+        assertEquals(sources, set(m, TikaCoreProperties.SOURCE));
+        assertEquals(identifiers, set(m, TikaCoreProperties.IDENTIFIER));
+        assertEquals(subjects, set(m, TikaCoreProperties.SUBJECT));
+
+        assertEquals(2, m.getValues(TikaCoreProperties.RIGHTS).length);
+        assertTrue(m.get(TikaCoreProperties.RIGHTS).startsWith("The source 
text and artwork"));
+        assertEquals("test rights", m.getValues(TikaCoreProperties.RIGHTS)[1]);
+    }
+
+    private Set<String> set(Metadata m, Property property) {
+        return new HashSet<>(Arrays.asList(m.getValues(property)));
+    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
new file mode 100644
index 000000000..b02b1af12
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
 differ
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 9b497cb8c..8ffb16a62 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -118,21 +118,24 @@ public class PDMetadataExtractor {
         if (dcSchema == null) {
             return;
         }
-        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER, 
XMPDC.IDENTIFIER);
-        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
-
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.CONTRIBUTOR.getName(), TikaCoreProperties.CONTRIBUTOR, 
XMPDC.CONTRIBUTOR);
+        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.COVERAGE.getName(), TikaCoreProperties.COVERAGE, 
XMPDC.COVERAGE);
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.CREATOR.getName(), TikaCoreProperties.CREATOR, 
XMPDC.CREATOR);
+
+        extractDublinCoreListItems(metadata, dcSchema, XMPDC.DATE.getName(), 
XMPDC.DATE);
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION, 
XMPDC.DESCRIPTION);
+        extractDublinCoreListItems(metadata, dcSchema, XMPDC.FORMAT.getName(), 
XMPDC.FORMAT);
+        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER, 
XMPDC.IDENTIFIER);
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.LANGUAGE.getName(), TikaCoreProperties.LANGUAGE, 
XMPDC.LANGUAGE);
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.PUBLISHER.getName(), TikaCoreProperties.PUBLISHER, 
XMPDC.PUBLISHER);
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.RELATION.getName(), TikaCoreProperties.RELATION, 
XMPDC.RELATION);
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
+        extractDublinCoreSimpleItem(metadata, dcSchema, 
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.SUBJECT.getName(), TikaCoreProperties.SUBJECT, 
XMPDC.SUBJECT);
+        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
         // finds only the first one?!
         extractDublinCoreListItems(metadata, dcSchema, 
TikaCoreProperties.TYPE.getName(), TikaCoreProperties.TYPE, XMPDC.TYPE);
 
-        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION, 
XMPDC.DESCRIPTION);
-        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
-        extractMultilingualItems(metadata, dcSchema, 
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
 
     }
 
@@ -399,6 +402,7 @@ public class PDMetadataExtractor {
                 String value = schema.getLanguageProperty(dcName, lang);
                 if (value != null && ! value.isBlank()) {
                     addMetadata(metadata, property, value);
+                    addMetadata(metadata, property.getName() + ":" + lang, 
value);
                 }
             }
         }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 823e7bbba..a7fb92621 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -711,15 +711,15 @@ public class PDFParserTest extends TikaTest {
 
     }
 
-    //STUB test for once TIKA-1295 is fixed
     @Test
     public void testMultipleTitles() throws Exception {
+        //TIKA-1295 and TIKA-4466
         XMLResult r = getXML("testPDFTripleLangTitle.pdf");
-        //TODO: add other tests as part of TIKA-1295
-        //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
-        //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting 
xmp?
-        //
-        assertEquals("Hello World", r.metadata.get("dc:title"));
+        String[] titles = new String[]{"Hello World", "Bonjour World", "你好世界"};
+        assertArrayEquals(titles, 
r.metadata.getValues(TikaCoreProperties.TITLE));
+        assertEquals("Hello World", 
r.metadata.get(TikaCoreProperties.TITLE.getName() + ":x-default"));
+        assertEquals("Bonjour World", 
r.metadata.get(TikaCoreProperties.TITLE.getName() + ":fr-ca"));
+        assertEquals("你好世界", r.metadata.get(TikaCoreProperties.TITLE.getName() 
+ ":zh-cn"));
     }
 
     @Test
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 172bf3b8e..a3c50c6cb 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -54,6 +54,7 @@ public class DcXMLParser extends XMLParser {
                 getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT, 
"format"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER, 
"identifier"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE, 
"language"),
+                getDublinCoreHandler(metadata, TikaCoreProperties.SOURCE, 
"source"),
                 getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS, 
"rights"));
     }
 
diff --git 
a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java 
b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
index ed03e2a0e..f7072665d 100644
--- a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
+++ b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
@@ -79,7 +79,7 @@ public abstract class AbstractConverter implements 
ITikaToXMPConverter {
                 registry.registerNamespace(namespace.uri, namespace.prefix);
             } catch (XMPException e) {
                 throw new TikaException(
-                        "Namespace needed by converter could not be 
registiered with XMPCore", e);
+                        "Namespace needed by converter could not be registered 
with XMPCore", e);
             }
         }
     }
diff --git a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java 
b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
index da25605f7..427ed1989 100644
--- a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
+++ b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
@@ -130,7 +130,7 @@ public class TikaToXMPTest {
 
         // general metadata is converted
         // check simple property
-        XMPProperty prop = xmp.getProperty(XMPConst.NS_DC, "language");
+        XMPProperty prop = xmp.getArrayItem(XMPConst.NS_DC, "language", 1);
         assertNotNull(prop);
         assertEquals("language", prop.getValue());
 
@@ -139,7 +139,7 @@ public class TikaToXMPTest {
         assertNotNull(prop);
         assertEquals("title", prop.getValue());
 
-        // OOXML one is not, the namespace has also not been registiered as 
the converter has not
+        // OOXML one is not, the namespace has also not been registered as the 
converter has not
         // been used
         XMPMetaFactory.getSchemaRegistry()
                 .registerNamespace(OfficeOpenXMLCore.NAMESPACE_URI, 
OfficeOpenXMLCore.PREFIX);

Reply via email to