Repository: tika Updated Branches: refs/heads/master 74e71ebd8 -> c5d4ec6c5
TIKA-1894: Add XMPMM support to PDFParser and JpegParser via Jempbox Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c5d4ec6c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c5d4ec6c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c5d4ec6c Branch: refs/heads/master Commit: c5d4ec6c50824a9a40fdd2b492bf7557d8d693f3 Parents: 74e71eb Author: tballison <[email protected]> Authored: Mon Mar 7 10:12:55 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Mar 7 10:12:55 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 4 +- .../java/org/apache/tika/metadata/XMPMM.java | 44 ++++++ .../tika/parser/image/xmp/JempboxExtractor.java | 138 +++++++++++++++---- .../org/apache/tika/parser/pdf/PDFParser.java | 17 ++- .../apache/tika/parser/jpeg/JpegParserTest.java | 16 +++ .../apache/tika/parser/pdf/PDFParserTest.java | 57 ++++++++ 6 files changed, 246 insertions(+), 30 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index a451feb..91bc623 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.13 - ??? + * Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894). + * Move serialization of TikaConfig to tika-core and enable dumping of the config file via tika-app (TIKA-1657). @@ -17,7 +19,7 @@ Release 1.13 - ??? * Upgrade to PDFBox 1.8.11 (TIKA-1830). * Upgrade to Jackson 2.7.1 (TIKA-1869). -i + * Upgrade to Apache SIS 0.6 (TIKA-1878). * RichTextContentHandler moved from the Server package to Core (TIKA-1870). http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java index 3fc4dfa..1a5ef6d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java @@ -72,4 +72,48 @@ public interface XMPMM { Property RENDITION_PARAMS = Property.externalText( PREFIX_ + "RenditionParams"); + /** + * Instance id in the XMPMM's history section + */ + Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag( + PREFIX_+"History:InstanceID" + ); + + /** + * Action in the XMPMM's history section + */ + Property HISTORY_ACTION = Property.externalTextBag( + PREFIX_+"History:Action" + ); + /** + * When the action occurred in the XMPMM's history section + */ + Property HISTORY_WHEN = Property.externalTextBag( + PREFIX_+"History:When" + ); + + /** + * Software agent that created the action in the XMPMM's + * history section + */ + Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag( + PREFIX_+"History:SoftwareAgent" + ); + + /** + * Document id for the document that this document + * was derived from + */ + Property DERIVED_FROM_DOCUMENTID = Property.externalText( + PREFIX_+"DerivedFrom:DocumentID" + ); + + /** + * Instance id for the document instance that this + * document was derived from + */ + Property DERIVED_FROM_INSTANCEID = Property.externalText( + PREFIX_+"DerivedFrom:InstanceID" + ); + } http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java index 10692b8..cd18907 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java @@ -16,23 +16,30 @@ */ package org.apache.tika.parser.image.xmp; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.util.Calendar; import java.util.List; +import org.apache.jempbox.xmp.ResourceEvent; +import org.apache.jempbox.xmp.ResourceRef; import org.apache.jempbox.xmp.XMPMetadata; import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.XMPSchemaMediaManagement; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; +import org.apache.tika.utils.DateUtils; import org.xml.sax.InputSource; -import static java.nio.charset.StandardCharsets.UTF_8; - public class JempboxExtractor { // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" @@ -53,32 +60,41 @@ public class JempboxExtractor { Reader decoded = new InputStreamReader( new ByteArrayInputStream(xmpraw.toByteArray()), DEFAULT_XMP_CHARSET); + XMPMetadata xmp = null; try { - XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded)); - XMPSchemaDublinCore dc = xmp.getDublinCoreSchema(); - if (dc != null) { - if (dc.getTitle() != null) { - metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); - } - if (dc.getDescription() != null) { - metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); - } - if (dc.getCreators() != null && dc.getCreators().size() > 0) { - metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); - } - if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { - for (String keyword : dc.getSubjects()) { - metadata.add(TikaCoreProperties.KEYWORDS, keyword); - } - // TODO should we set KEYWORDS too? - // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject + xmp = XMPMetadata.load(new InputSource(decoded)); + } catch (IOException e) { + // + } + + if (xmp == null) { + return; + } + XMPSchemaDublinCore dc = null; + try { + dc = xmp.getDublinCoreSchema(); + } catch (IOException e) { + } + + if (dc != null) { + if (dc.getTitle() != null) { + metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); + } + if (dc.getDescription() != null) { + metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); + } + if (dc.getCreators() != null && dc.getCreators().size() > 0) { + metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); + } + if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { + for (String keyword : dc.getSubjects()) { + metadata.add(TikaCoreProperties.KEYWORDS, keyword); } + // TODO should we set KEYWORDS too? + // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject } - } catch (IOException e) { - // Could not parse embedded XMP metadata. That's not a serious - // problem, so we'll just ignore the issue for now. - // TODO: Make error handling like this configurable. } + extractXMPMM(xmp, metadata); } protected String joinCreators(List<String> creators) { @@ -94,4 +110,78 @@ public class JempboxExtractor { } return c.substring(2); } + + /** + * Extracts Media Management metadata from XMP. + * + * Silently swallows exceptions. + * @param xmp + * @param metadata + */ + public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) { + XMPSchemaMediaManagement mmSchema = null; + try { + mmSchema = xmp.getMediaManagementSchema(); + } catch (IOException e) { + //swallow + return; + } + if (mmSchema != null) { + addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID()); + //not currently supported by JempBox... +// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID()); + + ResourceRef derivedFrom = mmSchema.getDerivedFrom(); + if (derivedFrom != null) { + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID()); + } catch (NullPointerException e) {} + + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID()); + } catch (NullPointerException e) {} + + //TODO: not yet supported by XMPBox...extract OriginalDocumentID + //in DerivedFrom section + } + if (mmSchema.getHistory() != null) { + for (ResourceEvent stevt : mmSchema.getHistory()) { + String instanceId = null; + String action = null; + Calendar when = null; + String softwareAgent = null; + try { + instanceId = stevt.getInstanceID(); + action = stevt.getAction(); + when = stevt.getWhen(); + softwareAgent = stevt.getSoftwareAgent(); + + //instanceid can throw npe; getWhen can throw IOException + } catch (NullPointerException|IOException e) { + //swallow + } + if (instanceId != null && instanceId.trim().length() > 0) { + //for absent data elements, pass in empty strings so + //that parallel arrays will have matching offsets + //for absent data + + action = (action == null) ? "" : action; + String dateString = (when == null) ? "" : DateUtils.formatDate(when); + softwareAgent = (softwareAgent == null) ? "" : softwareAgent; + + metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId); + metadata.add(XMPMM.HISTORY_ACTION, action); + metadata.add(XMPMM.HISTORY_WHEN, dateString); + metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent); + } + } + } + } + } + + private static void addMetadata(Metadata m, Property p, String value) { + if (value != null) { + m.add(p, value); + } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 29ebddf..8cb1b98 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -30,6 +30,7 @@ import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.jempbox.xmp.XMPSchema; import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.XMPSchemaMediaManagement; import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; @@ -58,6 +59,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.image.xmp.JempboxExtractor; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -223,19 +225,24 @@ public class PDFParser extends AbstractParser { Boolean.toString(ap.canPrintDegraded())); - //now go for the XMP stuff + //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; + XMPSchemaMediaManagement mmSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } - if (xmp != null) { + } catch (IOException e) {} + + if (xmp != null) { + try { dcSchema = xmp.getDublinCoreSchema(); - } - } catch (IOException e) { - //swallow + } catch (IOException e) {} + + JempboxExtractor.extractXMPMM(xmp, metadata); } + PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java index fd7ee29..6c90680 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java @@ -27,6 +27,7 @@ import java.util.List; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TIFF; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.junit.Test; @@ -247,4 +248,19 @@ public class JpegParserTest { assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL)); assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL)); } + + @Test + public void testJPEGXMPMM() throws Exception { + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + InputStream stream = + getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg"); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + //TODO: when jempbox is fixed/xmpbox is used + //add tests for history...currently not extracted + assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234", + metadata.get(XMPMM.DOCUMENTID)); + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 04d9f2b..47f3e0a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.pdf; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; @@ -31,6 +32,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; + import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -45,6 +47,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -1358,6 +1361,60 @@ public class PDFParserTest extends TikaTest { assertNotContained("Mount Rushmore National Memorial", xml); } + @Test + public void testXMPMM() throws Exception { +// XMLResult r = getXML("testPDF_Version.11.x.PDFA-1b.pdf"); + Metadata m = getXML("testPDF_twoAuthors.pdf").metadata; + assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e", + m.get(XMPMM.DOCUMENTID)); + + m = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata; + assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25", + m.get(XMPMM.DOCUMENTID)); + + //now test for 7 elements in each parallel array + //from the history section + assertArrayEquals(new String[]{ + "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf", + "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c", + "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d", + "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f", + "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa", + "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36", + "uuid:c1669773-a6ca-4bdd-aade-519030d0af00" + }, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID)); + + assertArrayEquals(new String[]{ + "converted", + "converted", + "converted", + "converted", + "converted", + "converted", + "converted" + }, m.getValues(XMPMM.HISTORY_ACTION)); + + assertArrayEquals(new String[]{ + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight" + }, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT)); + + assertArrayEquals(new String[]{ + "2014-03-04T23:50:41Z", + "2014-03-04T23:50:42Z", + "2014-03-04T23:51:34Z", + "2014-03-04T23:51:36Z", + "2014-03-04T23:51:37Z", + "2014-03-04T23:52:22Z", + "2014-03-04T23:54:48Z" + }, m.getValues(XMPMM.HISTORY_WHEN)); + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
