Repository: tika Updated Branches: refs/heads/2.x 35d1b2ad0 -> dc4ca999c
TIKA-1894 - Add XMPMM support to PDFParser and JpegParser via Jempbox Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dc4ca999 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dc4ca999 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dc4ca999 Branch: refs/heads/2.x Commit: dc4ca999c2855814158868af97e877cbcc74079a Parents: 35d1b2a Author: tballison <[email protected]> Authored: Mon Mar 7 13:09:47 2016 -0500 Committer: tballison <[email protected]> Committed: Mon Mar 7 13:09:47 2016 -0500 ---------------------------------------------------------------------- CHANGES.txt | 5 +- .../java/org/apache/tika/metadata/XMPMM.java | 44 +++++ .../tika-parser-multimedia-bundle/pom.xml | 1 + .../tika-parser-pdf-bundle/pom.xml | 1 + tika-parser-modules/pom.xml | 5 +- .../tika-parser-multimedia-module/pom.xml | 10 +- .../apache/tika/parser/image/TiffParser.java | 2 +- .../tika/parser/image/xmp/JempboxExtractor.java | 97 ---------- .../tika/parser/image/xmp/XMPPacketScanner.java | 113 ----------- .../org/apache/tika/parser/jpeg/JpegParser.java | 2 +- .../parser/image/xmp/JempboxExtractorTest.java | 107 ----------- .../apache/tika/parser/jpeg/JpegParserTest.java | 16 ++ .../tika-parser-pdf-module/pom.xml | 5 + .../org/apache/tika/parser/pdf/PDFParser.java | 15 +- .../apache/tika/parser/pdf/PDFParserTest.java | 57 ++++++ .../tika-parser-xmp-module/pom.xml | 52 ++++++ .../tika/module/xmp/internal/Activator.java | 36 ++++ .../tika/parser/xmp/JempboxExtractor.java | 187 +++++++++++++++++++ .../tika/parser/xmp/XMPPacketScanner.java | 113 +++++++++++ .../tika/parser/xmp/JempboxExtractorTest.java | 107 +++++++++++ 20 files changed, 643 insertions(+), 332 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt 
b/CHANGES.txt index e9d696d..d4611f0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -9,6 +9,9 @@ Release 2.0 - Future Development * (Something about more specific parser bundles, plus an overall one) Release 1.13 - ??? + + * Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894). + * Move serialization of TikaConfig to tika-core and enable dumping of the config file via tika-app (TIKA-1657). @@ -16,7 +19,7 @@ Release 1.13 - ??? * Upgrade to sqlite-jdbc 3.8.11.2 (TIKA-1861). NOTE: this dependency is still <scope>provided</scope>. You need to include this dependency - in order to parser sqlite files. + in order to parse sqlite files. * Upgrade to POI 3.14-beta1 (TIKA-1799). http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java index 3fc4dfa..1a5ef6d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java @@ -72,4 +72,48 @@ public interface XMPMM { Property RENDITION_PARAMS = Property.externalText( PREFIX_ + "RenditionParams"); + /** + * Instance id in the XMPMM's history section + */ + Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag( + PREFIX_+"History:InstanceID" + ); + + /** + * Action in the XMPMM's history section + */ + Property HISTORY_ACTION = Property.externalTextBag( + PREFIX_+"History:Action" + ); + /** + * When the action occurred in the XMPMM's history section + */ + Property HISTORY_WHEN = Property.externalTextBag( + PREFIX_+"History:When" + ); + + /** + * Software agent that created the action in the XMPMM's + * history section + */ + Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag( + PREFIX_+"History:SoftwareAgent" + ); + + /** + * Document id for the document that 
this document + * was derived from + */ + Property DERIVED_FROM_DOCUMENTID = Property.externalText( + PREFIX_+"DerivedFrom:DocumentID" + ); + + /** + * Instance id for the document instance that this + * document was derived from + */ + Property DERIVED_FROM_INSTANCEID = Property.externalText( + PREFIX_+"DerivedFrom:InstanceID" + ); + } http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml index 85e09f8..7b528bc 100644 --- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml @@ -45,6 +45,7 @@ com.sun.xml.internal.bind.marshaller</_runsystempackages> <Embed-Dependency> tika-parser-multimedia-module;inline=true, + tika-parser-xmp-module;inline=true, metadata-extractor;inline=true, xmpcore;inline=true, commons-codec;inline=true, http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml index 08cd863..27773a8 100644 --- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml @@ -47,6 +47,7 @@ <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator> <Embed-Dependency> tika-parser-pdf-module;inline=true, + tika-parser-xmp-module;inline=true, commons-io;inline=true, pdfbox;inline=true, bcmail-jdk15on;inline=true, http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml 
b/tika-parser-modules/pom.xml index ce5edd3..8a3435a 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -56,6 +56,7 @@ <module>tika-parser-scientific-module</module> <module>tika-parser-text-module</module> <module>tika-parser-web-module</module> + <module>tika-parser-xmp-module</module> </modules> <dependencies> @@ -72,7 +73,7 @@ <optional>true</optional> </dependency> <!-- Test dependencies --> - <dependency> + <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> <version>${project.version}</version> @@ -86,7 +87,7 @@ <type>test-jar</type> <scope>test</scope> </dependency> - <dependency> + <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml index f15f3bd..63ea5aa 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml +++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml @@ -36,6 +36,11 @@ <version>${project.version}</version> </dependency> <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-xmp-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> <groupId>com.drewnoakes</groupId> <artifactId>metadata-extractor</artifactId> <version>${metadata.extractor.version}</version> @@ -82,11 +87,6 @@ </dependency> <dependency> <groupId>org.apache.pdfbox</groupId> - <artifactId>jempbox</artifactId> - <version>${pdfbox.version}</version> - </dependency> - <dependency> - <groupId>org.apache.pdfbox</groupId> <artifactId>fontbox</artifactId> <version>${pdfbox.version}</version> </dependency> 
http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java index 3be436b..c98ce69 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java @@ -28,7 +28,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.image.xmp.JempboxExtractor; +import org.apache.tika.parser.xmp.JempboxExtractor; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java deleted file mode 100644 index 20d3db5..0000000 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.image.xmp; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.List; - -import org.apache.jempbox.xmp.XMPMetadata; -import org.apache.jempbox.xmp.XMPSchemaDublinCore; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.xml.sax.InputSource; - -import static java.nio.charset.StandardCharsets.UTF_8; - -public class JempboxExtractor { - - // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" - private static final String DEFAULT_XMP_CHARSET = UTF_8.name(); - private XMPPacketScanner scanner = new XMPPacketScanner(); - private Metadata metadata; - - public JempboxExtractor(Metadata metadata) { - this.metadata = metadata; - } - - public void parse(InputStream file) throws IOException, TikaException { - ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); - if (!scanner.parse(file, xmpraw)) { - return; - } - - Reader decoded = new InputStreamReader( - new ByteArrayInputStream(xmpraw.toByteArray()), - DEFAULT_XMP_CHARSET); - try { - 
XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded)); - XMPSchemaDublinCore dc = xmp.getDublinCoreSchema(); - if (dc != null) { - if (dc.getTitle() != null) { - metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); - } - if (dc.getDescription() != null) { - metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); - } - if (dc.getCreators() != null && dc.getCreators().size() > 0) { - metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); - } - if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { - for (String keyword : dc.getSubjects()) { - metadata.add(TikaCoreProperties.KEYWORDS, keyword); - } - // TODO should we set KEYWORDS too? - // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject - } - } - } catch (IOException e) { - // Could not parse embedded XMP metadata. That's not a serious - // problem, so we'll just ignore the issue for now. - // TODO: Make error handling like this configurable. - } - } - - protected String joinCreators(List<String> creators) { - if (creators == null || creators.size() == 0) { - return ""; - } - if (creators.size() == 1) { - return creators.get(0); - } - StringBuffer c = new StringBuffer(); - for (String s : creators) { - c.append(", ").append(s); - } - return c.substring(2); - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java deleted file mode 100644 index d4fa4bc..0000000 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java +++ 
/dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */ - -package org.apache.tika.parser.image.xmp; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import static java.nio.charset.StandardCharsets.US_ASCII; - -/** - * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet - * it finds and parses it. - * <p/> - * Important: Before you use this class to look for an XMP packet in some random file, please read - * the chapter on "Scanning Files for XMP Packets" in the XMP specification! - * <p/> - * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser. - * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant. 
- */ -public class XMPPacketScanner { - - private static final byte[] PACKET_HEADER; - private static final byte[] PACKET_HEADER_END; - private static final byte[] PACKET_TRAILER; - - static { - PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII); - PACKET_HEADER_END = "?>".getBytes(US_ASCII); - PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII); - } - - private static boolean skipAfter(InputStream in, byte[] match) throws IOException { - return skipAfter(in, match, null); - } - - private static boolean skipAfter(InputStream in, byte[] match, OutputStream out) - throws IOException { - int found = 0; - int len = match.length; - int b; - while ((b = in.read()) >= 0) { - if (b == match[found]) { - found++; - if (found == len) { - return true; - } - } else { - if (out != null) { - if (found > 0) { - out.write(match, 0, found); - } - out.write(b); - } - found = 0; - } - } - return false; - } - - /** - * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no - * XMP packet is found until the stream ends, null is returned. Note: This method - * only finds the first XMP packet in a stream. And it cannot determine whether it - * has found the right XMP packet if there are multiple packets. - * <p/> - * Does <em>not</em> close the stream. - * If XMP block was found reading can continue below the block. - * - * @param in the InputStream to search - * @param xmlOut to write the XMP packet to - * @return true if XMP packet is found, false otherwise - * @throws IOException if an I/O error occurs - * @throws TransformerException if an error occurs while parsing the XMP packet - */ - public boolean parse(InputStream in, OutputStream xmlOut) throws IOException { - if (!in.markSupported()) { - in = new java.io.BufferedInputStream(in); - } - boolean foundXMP = skipAfter(in, PACKET_HEADER); - if (!foundXMP) { - return false; - } - //TODO Inspect "begin" attribute! 
- if (!skipAfter(in, PACKET_HEADER_END)) { - throw new IOException("Invalid XMP packet header!"); - } - //TODO Do with TeeInputStream when Commons IO 1.4 is available - if (!skipAfter(in, PACKET_TRAILER, xmlOut)) { - throw new IOException("XMP packet not properly terminated!"); - } - return true; - } - -} - http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java index d13cd62..247194e 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java @@ -29,7 +29,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.image.ImageMetadataExtractor; -import org.apache.tika.parser.image.xmp.JempboxExtractor; +import org.apache.tika.parser.xmp.JempboxExtractor; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java deleted file 
mode 100644 index e389f17..0000000 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.image.xmp; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; -import java.util.Collection; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.junit.Test; - -public class JempboxExtractorTest { - - @Test - public void testParseJpeg() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); - // set some values before extraction to see that they are overridden - metadata.set(TikaCoreProperties.TITLE, "old title"); - metadata.set(TikaCoreProperties.DESCRIPTION, "old description"); - metadata.set(TikaCoreProperties.CREATOR, "previous author"); - // ... 
or kept in case the field is multi-value - metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // DublinCore fields - assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("oldkeyword")); - assertTrue(keywords.contains("grazelands")); - assertTrue(keywords.contains("nature reserve")); - assertTrue(keywords.contains("bird watching")); - assertTrue(keywords.contains("coast")); - Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); - assertTrue(subject.contains("oldkeyword")); - assertTrue(subject.contains("grazelands")); - assertTrue(subject.contains("nature reserve")); - assertTrue(subject.contains("bird watching")); - assertTrue(subject.contains("coast")); - } - - @Test - public void testParseJpegPhotoshop() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // DublinCore fields - assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("bird watching")); - assertTrue(keywords.contains("coast")); - } - - @Test - public void 
testParseJpegXnviewmp() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // XnViewMp fields not understood by Jempbox - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("coast")); - assertTrue(keywords.contains("nature reserve")); - } - - @Test - public void testJoinCreators() { - assertEquals("Mr B", new JempboxExtractor(null).joinCreators( - Arrays.asList("Mr B"))); - // TODO use multi-value property instead? - assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators( - Arrays.asList("Mr B", "Mr A"))); - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java index f187545..1f08476 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java @@ -27,6 +27,7 @@ import java.util.List; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TIFF; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; 
import org.junit.Test; @@ -247,4 +248,19 @@ public class JpegParserTest { assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL)); assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL)); } + + @Test + public void testJPEGXMPMM() throws Exception { + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); + InputStream stream = + getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg"); + parser.parse(stream, new DefaultHandler(), metadata, new ParseContext()); + + //TODO: when jempbox is fixed/xmpbox is used + //add tests for history...currently not extracted + assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234", + metadata.get(XMPMM.DOCUMENTID)); + } + } http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml index dfe2f0a..a706ff3 100644 --- a/tika-parser-modules/tika-parser-pdf-module/pom.xml +++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml @@ -34,6 +34,11 @@ <version>${project.version}</version> </dependency> <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-xmp-module</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>${commons.io.version}</version> http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 65f0b9c..6fe0396 
100644 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -58,6 +58,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.xmp.JempboxExtractor; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -223,19 +224,23 @@ public class PDFParser extends AbstractParser { Boolean.toString(ap.canPrintDegraded())); - //now go for the XMP stuff + //now go for the XMP org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } - if (xmp != null) { + } catch (IOException e) {} + + if (xmp != null) { + try { dcSchema = xmp.getDublinCoreSchema(); - } - } catch (IOException e) { - //swallow + } catch (IOException e) {} + + JempboxExtractor.extractXMPMM(xmp, metadata); } + PDDocumentInformation info = document.getDocumentInformation(); metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 8005c5b..a8bfaed 100644 --- 
a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.pdf; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; @@ -31,6 +32,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; + import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; @@ -45,6 +47,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -1357,6 +1360,60 @@ public class PDFParserTest extends TikaTest { assertNotContained("Mount Rushmore National Memorial", xml); } + @Test + public void testXMPMM() throws Exception { +// XMLResult r = getXML("testPDF_Version.11.x.PDFA-1b.pdf"); + Metadata m = getXML("testPDF_twoAuthors.pdf").metadata; + assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e", + m.get(XMPMM.DOCUMENTID)); + + m = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata; + assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25", + m.get(XMPMM.DOCUMENTID)); + + //now test for 7 elements in each parallel array + //from the history section + assertArrayEquals(new String[]{ + "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf", + "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c", + "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d", + "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f", + "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa", + "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36", + 
"uuid:c1669773-a6ca-4bdd-aade-519030d0af00" + }, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID)); + + assertArrayEquals(new String[]{ + "converted", + "converted", + "converted", + "converted", + "converted", + "converted", + "converted" + }, m.getValues(XMPMM.HISTORY_ACTION)); + + assertArrayEquals(new String[]{ + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight", + "Preflight" + }, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT)); + + assertArrayEquals(new String[]{ + "2014-03-04T23:50:41Z", + "2014-03-04T23:50:42Z", + "2014-03-04T23:51:34Z", + "2014-03-04T23:51:36Z", + "2014-03-04T23:51:37Z", + "2014-03-04T23:52:22Z", + "2014-03-04T23:54:48Z" + }, m.getValues(XMPMM.HISTORY_WHEN)); + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path); http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/pom.xml b/tika-parser-modules/tika-parser-xmp-module/pom.xml new file mode 100644 index 0000000..2101075 --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-module/pom.xml @@ -0,0 +1,52 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. 
You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-parser-xmp-module</artifactId> + <name>Apache Tika parser xmp module</name> + <url>http://tika.apache.org/</url> + + <properties> + <mime4j.version>0.7.2</mime4j.version> + </properties> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>jempbox</artifactId> + <version>${pdfbox.version}</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java new file mode 100644 
index 0000000..4161c6e --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.module.xmp.internal; + +import org.apache.tika.osgi.TikaAbstractBundleActivator; +import org.osgi.framework.BundleContext; + +public class Activator extends TikaAbstractBundleActivator { + + @Override + public void start(BundleContext context) throws Exception { + + registerTikaParserServiceLoader(context, Activator.class.getClassLoader()); + + } + + @Override + public void stop(BundleContext context) throws Exception { + + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java new file mode 100644 index 0000000..aa72896 --- /dev/null +++ 
b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xmp; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Calendar; +import java.util.List; + +import org.apache.jempbox.xmp.ResourceEvent; +import org.apache.jempbox.xmp.ResourceRef; +import org.apache.jempbox.xmp.XMPMetadata; +import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.XMPSchemaMediaManagement; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; +import org.apache.tika.utils.DateUtils; +import org.xml.sax.InputSource; + +public class JempboxExtractor { + + // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" + private static final 
String DEFAULT_XMP_CHARSET = UTF_8.name(); + private XMPPacketScanner scanner = new XMPPacketScanner(); + private Metadata metadata; + + public JempboxExtractor(Metadata metadata) { + this.metadata = metadata; + } + + public void parse(InputStream file) throws IOException, TikaException { + ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); + if (!scanner.parse(file, xmpraw)) { + return; + } + + Reader decoded = new InputStreamReader( + new ByteArrayInputStream(xmpraw.toByteArray()), + DEFAULT_XMP_CHARSET); + XMPMetadata xmp = null; + try { + xmp = XMPMetadata.load(new InputSource(decoded)); + } catch (IOException e) { + // + } + + if (xmp == null) { + return; + } + XMPSchemaDublinCore dc = null; + try { + dc = xmp.getDublinCoreSchema(); + } catch (IOException e) { + } + + if (dc != null) { + if (dc.getTitle() != null) { + metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); + } + if (dc.getDescription() != null) { + metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); + } + if (dc.getCreators() != null && dc.getCreators().size() > 0) { + metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); + } + if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { + for (String keyword : dc.getSubjects()) { + metadata.add(TikaCoreProperties.KEYWORDS, keyword); + } + // TODO should we set KEYWORDS too? + // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject + } + } + extractXMPMM(xmp, metadata); + } + + protected String joinCreators(List<String> creators) { + if (creators == null || creators.size() == 0) { + return ""; + } + if (creators.size() == 1) { + return creators.get(0); + } + StringBuffer c = new StringBuffer(); + for (String s : creators) { + c.append(", ").append(s); + } + return c.substring(2); + } + + /** + * Extracts Media Management metadata from XMP. + * + * Silently swallows exceptions. 
+ * @param xmp + * @param metadata + */ + public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) { + XMPSchemaMediaManagement mmSchema = null; + try { + mmSchema = xmp.getMediaManagementSchema(); + } catch (IOException e) { + //swallow + return; + } + if (mmSchema != null) { + addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID()); + //not currently supported by JempBox... +// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID()); + + ResourceRef derivedFrom = mmSchema.getDerivedFrom(); + if (derivedFrom != null) { + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID()); + } catch (NullPointerException e) {} + + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID()); + } catch (NullPointerException e) {} + + //TODO: not yet supported by XMPBox...extract OriginalDocumentID + //in DerivedFrom section + } + if (mmSchema.getHistory() != null) { + for (ResourceEvent stevt : mmSchema.getHistory()) { + String instanceId = null; + String action = null; + Calendar when = null; + String softwareAgent = null; + try { + instanceId = stevt.getInstanceID(); + action = stevt.getAction(); + when = stevt.getWhen(); + softwareAgent = stevt.getSoftwareAgent(); + + //instanceid can throw npe; getWhen can throw IOException + } catch (NullPointerException|IOException e) { + //swallow + } + if (instanceId != null && instanceId.trim().length() > 0) { + //for absent data elements, pass in empty strings so + //that parallel arrays will have matching offsets + //for absent data + + action = (action == null) ? "" : action; + String dateString = (when == null) ? "" : DateUtils.formatDate(when); + softwareAgent = (softwareAgent == null) ? 
"" : softwareAgent; + + metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId); + metadata.add(XMPMM.HISTORY_ACTION, action); + metadata.add(XMPMM.HISTORY_WHEN, dateString); + metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent); + } + } + } + } + } + + private static void addMetadata(Metadata m, Property p, String value) { + if (value != null) { + m.add(p, value); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java new file mode 100644 index 0000000..70018cd --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */ + +package org.apache.tika.parser.xmp; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import static java.nio.charset.StandardCharsets.US_ASCII; + +/** + * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet + * it finds and parses it. + * <p/> + * Important: Before you use this class to look for an XMP packet in some random file, please read + * the chapter on "Scanning Files for XMP Packets" in the XMP specification! + * <p/> + * This class was branched from http://xmlgraphics.apache.org/ XMPPacketParser. + * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant. + */ +public class XMPPacketScanner { + + private static final byte[] PACKET_HEADER; + private static final byte[] PACKET_HEADER_END; + private static final byte[] PACKET_TRAILER; + + static { + PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII); + PACKET_HEADER_END = "?>".getBytes(US_ASCII); + PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII); + } + + private static boolean skipAfter(InputStream in, byte[] match) throws IOException { + return skipAfter(in, match, null); + } + + private static boolean skipAfter(InputStream in, byte[] match, OutputStream out) + throws IOException { + int found = 0; + int len = match.length; + int b; + while ((b = in.read()) >= 0) { + if (b == match[found]) { + found++; + if (found == len) { + return true; + } + } else { + if (out != null) { + if (found > 0) { + out.write(match, 0, found); + } + out.write(b); + } + found = 0; + } + } + return false; + } + + /** + * Locates an XMP packet in a stream and writes its content to the given output stream. If no + * XMP packet is found before the stream ends, false is returned. Note: This method + * only finds the first XMP packet in a stream. And it cannot determine whether it + * has found the right XMP packet if there are multiple packets. 
+ * <p/> + * Does <em>not</em> close the stream. + * If XMP block was found reading can continue below the block. + * + * @param in the InputStream to search + * @param xmlOut to write the XMP packet to + * @return true if XMP packet is found, false otherwise + * @throws IOException if an I/O error occurs, or if the packet header is not closed + * or the packet is not terminated by a trailer + */ + public boolean parse(InputStream in, OutputStream xmlOut) throws IOException { + if (!in.markSupported()) { + in = new java.io.BufferedInputStream(in); + } + boolean foundXMP = skipAfter(in, PACKET_HEADER); + if (!foundXMP) { + return false; + } + //TODO Inspect "begin" attribute! + if (!skipAfter(in, PACKET_HEADER_END)) { + throw new IOException("Invalid XMP packet header!"); + } + //TODO Do with TeeInputStream when Commons IO 1.4 is available + if (!skipAfter(in, PACKET_TRAILER, xmlOut)) { + throw new IOException("XMP packet not properly terminated!"); + } + return true; + } + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/dc4ca999/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java new file mode 100644 index 0000000..849fd01 --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xmp; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collection; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +public class JempboxExtractorTest { + + @Test + public void testParseJpeg() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); + // set some values before extraction to see that they are overridden + metadata.set(TikaCoreProperties.TITLE, "old title"); + metadata.set(TikaCoreProperties.DESCRIPTION, "old description"); + metadata.set(TikaCoreProperties.CREATOR, "previous author"); + // ... 
or kept in case the field is multi-value + metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // DublinCore fields + assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("oldkeyword")); + assertTrue(keywords.contains("grazelands")); + assertTrue(keywords.contains("nature reserve")); + assertTrue(keywords.contains("bird watching")); + assertTrue(keywords.contains("coast")); + Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); + assertTrue(subject.contains("oldkeyword")); + assertTrue(subject.contains("grazelands")); + assertTrue(subject.contains("nature reserve")); + assertTrue(subject.contains("bird watching")); + assertTrue(subject.contains("coast")); + } + + @Test + public void testParseJpegPhotoshop() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // DublinCore fields + assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("bird watching")); + assertTrue(keywords.contains("coast")); + } + + @Test + public void 
testParseJpegXnviewmp() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // XnViewMp fields not understood by Jempbox + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("coast")); + assertTrue(keywords.contains("nature reserve")); + } + + @Test + public void testJoinCreators() { + assertEquals("Mr B", new JempboxExtractor(null).joinCreators( + Arrays.asList("Mr B"))); + // TODO use multi-value property instead? + assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators( + Arrays.asList("Mr B", "Mr A"))); + } + +}
