This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 24f13c446 TIKA-4444: get subject from xmp + add tests
24f13c446 is described below
commit 24f13c4466392685cda663a16afe5d53796dd92f
Author: Tilman Hausherr <[email protected]>
AuthorDate: Wed Jun 25 15:08:50 2025 +0200
TIKA-4444: get subject from xmp + add tests
---
.../java/org/apache/tika/parser/pdf/PDMetadataExtractor.java | 1 +
.../test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index a3fac7728..a8d35e1ba 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -124,6 +124,7 @@ public class PDMetadataExtractor {
extractDublinCoreListItems(metadata, TikaCoreProperties.PUBLISHER,
dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.RELATION,
dcSchema);
extractDublinCoreSimpleItem(metadata, TikaCoreProperties.SOURCE,
dcSchema);
+ extractDublinCoreListItems(metadata, TikaCoreProperties.SUBJECT,
dcSchema);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
index 9b7907b4d..dc6e3b3b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/CustomTikaXMPTest.java
@@ -59,6 +59,9 @@ public class CustomTikaXMPTest extends TikaTest {
public void testPDFUA() throws Exception {
Metadata metadata = extract("testPDFUA.xmp");
assertEquals(1, metadata.getInt(PDF.PDFUAID_PART));
+ String[] subjects = metadata.getValues(TikaCoreProperties.SUBJECT);
+ assertEquals("keywords", subjects[0]);
+ assertEquals("subject", subjects[1]);
}
@Test
@@ -83,6 +86,12 @@ public class CustomTikaXMPTest extends TikaTest {
assertEquals("International Union of Thinkology",
metadata.get(TikaCoreProperties.PUBLISHER));
assertEquals("Relation", metadata.get(TikaCoreProperties.RELATION));
assertEquals("Journal of Thinkology",
metadata.get(TikaCoreProperties.SOURCE));
+ String[] subjects = metadata.getValues(TikaCoreProperties.SUBJECT);
+ assertEquals("THOUGHTS", subjects[0]);
+ assertEquals("HAPPINESS", subjects[1]);
+ assertEquals("FEAR", subjects[2]);
+ assertEquals("ANGER", subjects[3]);
+ assertEquals("DESPAIR", subjects[4]);
}
private Metadata extract(String xmpFileName) throws IOException,
TikaException, SAXException {