This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new d3a58c086 TIKA-4442: collect 6 more dublin core properties
d3a58c086 is described below
commit d3a58c086e720591b0f7183631b8ce6283f31711
Author: Tilman Hausherr <[email protected]>
AuthorDate: Tue Jun 24 14:51:43 2025 +0200
TIKA-4442: collect 6 more dublin core properties
---
.../tika/parser/pdf/PDMetadataExtractor.java | 23 ++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index ce1109fc1..a3fac7728 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -118,6 +118,12 @@ public class PDMetadataExtractor {
extractDublinCoreListItems(metadata,
TikaCoreProperties.CONTRIBUTOR, dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR,
dcSchema);
extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null,
dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.TYPE,
dcSchema); // finds only the first one?!
+ extractDublinCoreSimpleItem(metadata,
TikaCoreProperties.IDENTIFIER, dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.LANGUAGE,
dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.PUBLISHER,
dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.RELATION,
dcSchema);
+ extractDublinCoreSimpleItem(metadata, TikaCoreProperties.SOURCE,
dcSchema);
}
}
@@ -437,6 +443,23 @@ public class PDMetadataExtractor {
}
}
+ /**
+ * Reads one text value from XMPSchemaDublinCore and passes it to addMetadata().
+ * <p>
+ * This relies on the property having a DublinCore compliant getName()
+ *
+ * @param metadata the metadata object that is handed on to addMetadata()
+ * @param property the property whose getName() is used for the schema lookup
+ * @param dc the Dublin Core schema to read from; if null, nothing is done
+ */
+ private static void extractDublinCoreSimpleItem(Metadata metadata,
Property property,
+ XMPSchemaDublinCore dc) {
+ if (dc == null) {
+ return; // no Dublin Core schema to read from
+ }
+ String textProperty = dc.getTextProperty(property.getName());
+ addMetadata(metadata, property, textProperty);
+ }
static void addMetadata(Metadata metadata, Property property, String
value) {
if (value != null) {