This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 79500639f TIKA-4466 -- allow multiple values for many DublinCore
values (#2308)
79500639f is described below
commit 79500639ff9b495091a11b0e8777bcb1b6aad915
Author: Tim Allison <[email protected]>
AuthorDate: Wed Aug 20 08:55:19 2025 -0400
TIKA-4466 -- allow multiple values for many DublinCore values (#2308)
* TIKA-4466 -- allow multiple values for many DublinCore values
(cherry picked from commit dc69e39e332adee20c54b7e8db040e4b771e8fdc)
---
CHANGES.txt | 7 ++++
.../java/org/apache/tika/metadata/DublinCore.java | 20 +++++-----
.../apache/tika/parser/epub/EpubParserTest.java | 42 +++++++++++++++++++++
.../testEPUB_multi-metadata-vals.epub | Bin 0 -> 571926 bytes
.../tika/parser/pdf/PDMetadataExtractor.java | 16 +++++---
.../org/apache/tika/parser/pdf/PDFParserTest.java | 12 +++---
.../org/apache/tika/parser/xml/DcXMLParser.java | 1 +
.../apache/tika/xmp/convert/AbstractConverter.java | 2 +-
.../java/org/apache/tika/xmp/TikaToXMPTest.java | 4 +-
9 files changed, 79 insertions(+), 25 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index f3b5f6e97..e37862317 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,9 +1,16 @@
+Release 3.3.0 - ???
+
+ * Allow multiple values for many Dublin Core values (TIKA-4466).
+
Release 3.2.2 - 8/6/2025
* Improve detection of encrypted ODT files (TIKA-4459)
* Dependency updates (TIKA-4455).
+ * Fix concurrency bug in TikaToXMP (TIKA-4393)
+
+
Release 3.2.1 - 6/26/2025
* Fix POIFSContainerDetector regression when wrapping an InputStream in
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index a4e32cb8b..283080a0d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -57,7 +57,7 @@ public interface DublinCore {
* the Digital Object Identifier (DOI) and the International Standard
* Book Number (ISBN).
*/
- Property IDENTIFIER = Property.internalText(
+ Property IDENTIFIER = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"identifier");
/**
@@ -85,7 +85,7 @@ public interface DublinCore {
* appropriate, named places or time periods be used in preference to
* numeric identifiers such as sets of coordinates or date ranges.
*/
- Property COVERAGE = Property.internalText(
+ Property COVERAGE = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"coverage");
/**
@@ -118,7 +118,7 @@ public interface DublinCore {
* a graphical representation of content or a free-text account of
* the content.
*/
- Property DESCRIPTION = Property.internalText(
+ Property DESCRIPTION = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"description");
/**
@@ -128,7 +128,7 @@ public interface DublinCore {
* tags with optional subtags. Examples include "en" or "eng" for English,
* "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
*/
- Property LANGUAGE = Property.internalText(
+ Property LANGUAGE = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"language");
/**
@@ -136,7 +136,7 @@ public interface DublinCore {
* a Publisher include a person, an organisation, or a service. Typically,
* the name of a Publisher should be used to indicate the entity.
*/
- Property PUBLISHER = Property.internalText(
+ Property PUBLISHER = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"publisher");
/**
@@ -144,7 +144,7 @@ public interface DublinCore {
* reference the resource by means of a string or number conforming to
* a formal identification system.
*/
- Property RELATION = Property.internalText(
+ Property RELATION = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"relation");
/**
@@ -156,7 +156,7 @@ public interface DublinCore {
* is absent, no assumptions can be made about the status of these and
* other rights with respect to the resource.
*/
- Property RIGHTS = Property.internalText(
+ Property RIGHTS = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"rights");
/**
@@ -166,7 +166,7 @@ public interface DublinCore {
* means of a string or number conforming to a formal identification
* system.
*/
- Property SOURCE = Property.internalText(
+ Property SOURCE = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"source");
/**
@@ -183,7 +183,7 @@ public interface DublinCore {
* A name given to the resource. Typically, a Title will be a name by
* which the resource is formally known.
*/
- Property TITLE = Property.internalText(
+ Property TITLE = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"title");
/**
@@ -194,7 +194,7 @@ public interface DublinCore {
* [DCMITYPE]). To describe the physical or digital manifestation of
* the resource, use the Format element.
*/
- Property TYPE = Property.internalText(
+ Property TYPE = Property.internalTextBag(
PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"type");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index 57e6a9a3b..ae9305a49 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -18,9 +18,13 @@ package org.apache.tika.parser.epub;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@@ -29,6 +33,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Epub;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -131,4 +136,41 @@ public class EpubParserTest extends TikaTest {
List<Metadata> metadataList =
getRecursiveMetadata("cole-voyage-of-life.epub");
assertEquals("pre-paginated",
metadataList.get(0).get(Epub.RENDITION_LAYOUT));
}
+
+ @Test
+ public void testMultipleMetadataValues() throws Exception {
+ //TIKA-4466
+ List<Metadata> metadataList =
getRecursiveMetadata("testEPUB_multi-metadata-vals.epub");
+ Set<String> publishers = Set.of("Standard Ebooks", "Guternberg");
+ Set<String> titles = Set.of("The Inheritors", "An Extravagant Story",
"The Inheritors: An Extravagant Story");
+ Set<String> contributors = Set.of("The League of Moveable Type",
"zikasak", "William Holyoake", "Clare Boothby",
+ "Graeme Mackreth", "Distributed Proofreaders", "Szymon Szott",
"David Reimer");
+ Set<String> creators = Set.of("Joseph Conrad", "Ford Madox Ford");
+ Set<String> languages = Set.of("en-GB", "en-US");
+ Set<String> descriptions = Set.of("A young writer dabbling in
journalism meets a strange, otherworldly woman with long-term political goals.",
+ "additional description");
+ Set<String> sources = Set.of("https://www.gutenberg.org/ebooks/14888",
"https://archive.org/details/inheritorsanext01fordgoog/");
+ Set<String> identifiers =
Set.of("https://standardebooks.org/ebooks/joseph-conrad_ford-madox-ford/the-inheritors",
+ "isbn:0571225470");
+ Set<String> subjects = Set.of("Science fiction");
+
+ Metadata m = metadataList.get(0);
+ assertEquals(publishers, set(m, TikaCoreProperties.PUBLISHER));
+ assertEquals(titles, set(m, TikaCoreProperties.TITLE));
+ assertEquals(contributors, set(m, TikaCoreProperties.CONTRIBUTOR));
+ assertEquals(creators, set(m, TikaCoreProperties.CREATOR));
+ assertEquals(languages, set(m, TikaCoreProperties.LANGUAGE));
+ assertEquals(descriptions, set(m, TikaCoreProperties.DESCRIPTION));
+ assertEquals(sources, set(m, TikaCoreProperties.SOURCE));
+ assertEquals(identifiers, set(m, TikaCoreProperties.IDENTIFIER));
+ assertEquals(subjects, set(m, TikaCoreProperties.SUBJECT));
+
+ assertEquals(2, m.getValues(TikaCoreProperties.RIGHTS).length);
+ assertTrue(m.get(TikaCoreProperties.RIGHTS).startsWith("The source
text and artwork"));
+ assertEquals("test rights", m.getValues(TikaCoreProperties.RIGHTS)[1]);
+ }
+
+ private Set<String> set(Metadata m, Property property) {
+ return new HashSet<>(Arrays.asList(m.getValues(property)));
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
new file mode 100644
index 000000000..b02b1af12
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/testEPUB_multi-metadata-vals.epub
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 9b497cb8c..8ffb16a62 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -118,21 +118,24 @@ public class PDMetadataExtractor {
if (dcSchema == null) {
return;
}
- extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER,
XMPDC.IDENTIFIER);
- extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
-
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.CONTRIBUTOR.getName(), TikaCoreProperties.CONTRIBUTOR,
XMPDC.CONTRIBUTOR);
+ extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.COVERAGE.getName(), TikaCoreProperties.COVERAGE,
XMPDC.COVERAGE);
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.CREATOR.getName(), TikaCoreProperties.CREATOR,
XMPDC.CREATOR);
+
+ extractDublinCoreListItems(metadata, dcSchema, XMPDC.DATE.getName(),
XMPDC.DATE);
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION,
XMPDC.DESCRIPTION);
+ extractDublinCoreListItems(metadata, dcSchema, XMPDC.FORMAT.getName(),
XMPDC.FORMAT);
+ extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.IDENTIFIER.getName(), TikaCoreProperties.IDENTIFIER,
XMPDC.IDENTIFIER);
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.LANGUAGE.getName(), TikaCoreProperties.LANGUAGE,
XMPDC.LANGUAGE);
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.PUBLISHER.getName(), TikaCoreProperties.PUBLISHER,
XMPDC.PUBLISHER);
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.RELATION.getName(), TikaCoreProperties.RELATION,
XMPDC.RELATION);
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
+ extractDublinCoreSimpleItem(metadata, dcSchema,
TikaCoreProperties.SOURCE.getName(), TikaCoreProperties.SOURCE, XMPDC.SOURCE);
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.SUBJECT.getName(), TikaCoreProperties.SUBJECT,
XMPDC.SUBJECT);
+ extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
// finds only the first one?!
extractDublinCoreListItems(metadata, dcSchema,
TikaCoreProperties.TYPE.getName(), TikaCoreProperties.TYPE, XMPDC.TYPE);
- extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.DESCRIPTION.getName(), TikaCoreProperties.DESCRIPTION,
XMPDC.DESCRIPTION);
- extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.RIGHTS.getName(), TikaCoreProperties.RIGHTS, XMPDC.RIGHTS);
- extractMultilingualItems(metadata, dcSchema,
TikaCoreProperties.TITLE.getName(), TikaCoreProperties.TITLE, XMPDC.TITLE);
}
@@ -399,6 +402,7 @@ public class PDMetadataExtractor {
String value = schema.getLanguageProperty(dcName, lang);
if (value != null && ! value.isBlank()) {
addMetadata(metadata, property, value);
+ addMetadata(metadata, property.getName() + ":" + lang,
value);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 823e7bbba..a7fb92621 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -711,15 +711,15 @@ public class PDFParserTest extends TikaTest {
}
- //STUB test for once TIKA-1295 is fixed
@Test
public void testMultipleTitles() throws Exception {
+ //TIKA-1295 and TIKA-4466
XMLResult r = getXML("testPDFTripleLangTitle.pdf");
- //TODO: add other tests as part of TIKA-1295
- //dc:title-fr-ca (or whatever we decide) should be "Bonjour World"
- //dc:title-zh-ch is currently hosed...bug in PDFBox while injecting
xmp?
- //
- assertEquals("Hello World", r.metadata.get("dc:title"));
+ String[] titles = new String[]{"Hello World", "Bonjour World", "你好世界"};
+ assertArrayEquals(titles,
r.metadata.getValues(TikaCoreProperties.TITLE));
+ assertEquals("Hello World",
r.metadata.get(TikaCoreProperties.TITLE.getName() + ":x-default"));
+ assertEquals("Bonjour World",
r.metadata.get(TikaCoreProperties.TITLE.getName() + ":fr-ca"));
+ assertEquals("你好世界", r.metadata.get(TikaCoreProperties.TITLE.getName()
+ ":zh-cn"));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
index 172bf3b8e..a3c50c6cb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
@@ -54,6 +54,7 @@ public class DcXMLParser extends XMLParser {
getDublinCoreHandler(metadata, TikaCoreProperties.FORMAT,
"format"),
getDublinCoreHandler(metadata, TikaCoreProperties.IDENTIFIER,
"identifier"),
getDublinCoreHandler(metadata, TikaCoreProperties.LANGUAGE,
"language"),
+ getDublinCoreHandler(metadata, TikaCoreProperties.SOURCE,
"source"),
getDublinCoreHandler(metadata, TikaCoreProperties.RIGHTS,
"rights"));
}
diff --git
a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
index ed03e2a0e..f7072665d 100644
--- a/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
+++ b/tika-xmp/src/main/java/org/apache/tika/xmp/convert/AbstractConverter.java
@@ -79,7 +79,7 @@ public abstract class AbstractConverter implements
ITikaToXMPConverter {
registry.registerNamespace(namespace.uri, namespace.prefix);
} catch (XMPException e) {
throw new TikaException(
- "Namespace needed by converter could not be
registiered with XMPCore", e);
+ "Namespace needed by converter could not be registered
with XMPCore", e);
}
}
}
diff --git a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
index da25605f7..427ed1989 100644
--- a/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
+++ b/tika-xmp/src/test/java/org/apache/tika/xmp/TikaToXMPTest.java
@@ -130,7 +130,7 @@ public class TikaToXMPTest {
// general metadata is converted
// check simple property
- XMPProperty prop = xmp.getProperty(XMPConst.NS_DC, "language");
+ XMPProperty prop = xmp.getArrayItem(XMPConst.NS_DC, "language", 1);
assertNotNull(prop);
assertEquals("language", prop.getValue());
@@ -139,7 +139,7 @@ public class TikaToXMPTest {
assertNotNull(prop);
assertEquals("title", prop.getValue());
- // OOXML one is not, the namespace has also not been registiered as
the converter has not
+ // OOXML one is not, the namespace has also not been registered as the
converter has not
// been used
XMPMetaFactory.getSchemaRegistry()
.registerNamespace(OfficeOpenXMLCore.NAMESPACE_URI,
OfficeOpenXMLCore.PREFIX);