Repository: tika Updated Branches: refs/heads/2.x 5f413ffa7 -> c58af959b
TIKA-1894 -- clean up following recommendations from Ray Gauss and Bob Paulin. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c58af959 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c58af959 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c58af959 Branch: refs/heads/2.x Commit: c58af959b6cc3f3a3d8f555d53b147388e36b01d Parents: 5f413ff Author: tballison <[email protected]> Authored: Fri Mar 18 12:24:13 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Mar 18 12:24:13 2016 -0400 ---------------------------------------------------------------------- .../tika-parser-multimedia-bundle/pom.xml | 2 +- .../tika-parser-pdf-bundle/pom.xml | 2 +- tika-parser-modules/pom.xml | 2 +- .../tika-parser-multimedia-module/pom.xml | 2 +- .../tika-parser-pdf-module/pom.xml | 2 +- .../tika-parser-xmp-commons/pom.xml | 48 +++++ .../tika/parser/xmp/JempboxExtractor.java | 187 +++++++++++++++++++ .../tika/parser/xmp/XMPPacketScanner.java | 113 +++++++++++ .../tika/parser/xmp/JempboxExtractorTest.java | 107 +++++++++++ .../tika-parser-xmp-module/pom.xml | 52 ------ .../tika/module/xmp/internal/Activator.java | 36 ---- .../tika/parser/xmp/JempboxExtractor.java | 187 ------------------- .../tika/parser/xmp/XMPPacketScanner.java | 113 ----------- .../tika/parser/xmp/JempboxExtractorTest.java | 107 ----------- 14 files changed, 460 insertions(+), 500 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml index 7b528bc..067d6f3 100644 --- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml +++ 
b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml @@ -45,7 +45,7 @@ com.sun.xml.internal.bind.marshaller</_runsystempackages> <Embed-Dependency> tika-parser-multimedia-module;inline=true, - tika-parser-xmp-module;inline=true, + tika-parser-xmp-commons;inline=true, metadata-extractor;inline=true, xmpcore;inline=true, commons-codec;inline=true, http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml index 27773a8..771389a 100644 --- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml +++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml @@ -47,7 +47,7 @@ <Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator> <Embed-Dependency> tika-parser-pdf-module;inline=true, - tika-parser-xmp-module;inline=true, + tika-parser-xmp-commons;inline=true, commons-io;inline=true, pdfbox;inline=true, bcmail-jdk15on;inline=true, http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml index 6342ebb..88205ca 100644 --- a/tika-parser-modules/pom.xml +++ b/tika-parser-modules/pom.xml @@ -56,7 +56,7 @@ <module>tika-parser-scientific-module</module> <module>tika-parser-text-module</module> <module>tika-parser-web-module</module> - <module>tika-parser-xmp-module</module> + <module>tika-parser-xmp-commons</module> </modules> <dependencies> http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-multimedia-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml 
b/tika-parser-modules/tika-parser-multimedia-module/pom.xml index 63ea5aa..632ed86 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml +++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml @@ -37,7 +37,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-xmp-module</artifactId> + <artifactId>tika-parser-xmp-commons</artifactId> <version>${project.version}</version> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-pdf-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml index a706ff3..48f8eec 100644 --- a/tika-parser-modules/tika-parser-pdf-module/pom.xml +++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml @@ -35,7 +35,7 @@ </dependency> <dependency> <groupId>${project.groupId}</groupId> - <artifactId>tika-parser-xmp-module</artifactId> + <artifactId>tika-parser-xmp-commons</artifactId> <version>${project.version}</version> </dependency> <dependency> http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-commons/pom.xml b/tika-parser-modules/tika-parser-xmp-commons/pom.xml new file mode 100644 index 0000000..80729cc --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-commons/pom.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. 
The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-modules</artifactId> + <version>2.0-SNAPSHOT</version> + </parent> + + <artifactId>tika-parser-xmp-commons</artifactId> + <name>Apache Tika parser xmp commons</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>jempbox</artifactId> + <version>${pdfbox.version}</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java 
b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java new file mode 100644 index 0000000..aa72896 --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.xmp; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Calendar; +import java.util.List; + +import org.apache.jempbox.xmp.ResourceEvent; +import org.apache.jempbox.xmp.ResourceRef; +import org.apache.jempbox.xmp.XMPMetadata; +import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.XMPSchemaMediaManagement; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMPMM; +import org.apache.tika.utils.DateUtils; +import org.xml.sax.InputSource; + +public class JempboxExtractor { + + // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" + private static final String DEFAULT_XMP_CHARSET = UTF_8.name(); + private XMPPacketScanner scanner = new XMPPacketScanner(); + private Metadata metadata; + + public JempboxExtractor(Metadata metadata) { + this.metadata = metadata; + } + + public void parse(InputStream file) throws IOException, TikaException { + ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); + if (!scanner.parse(file, xmpraw)) { + return; + } + + Reader decoded = new InputStreamReader( + new ByteArrayInputStream(xmpraw.toByteArray()), + DEFAULT_XMP_CHARSET); + XMPMetadata xmp = null; + try { + xmp = XMPMetadata.load(new InputSource(decoded)); + } catch (IOException e) { + // + } + + if (xmp == null) { + return; + } + XMPSchemaDublinCore dc = null; + try { + dc = xmp.getDublinCoreSchema(); + } catch (IOException e) { + } + + if (dc != null) { + if (dc.getTitle() != null) { + metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); + } + if (dc.getDescription() != 
null) { + metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); + } + if (dc.getCreators() != null && dc.getCreators().size() > 0) { + metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); + } + if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { + for (String keyword : dc.getSubjects()) { + metadata.add(TikaCoreProperties.KEYWORDS, keyword); + } + // TODO should we set KEYWORDS too? + // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject + } + } + extractXMPMM(xmp, metadata); + } + + protected String joinCreators(List<String> creators) { + if (creators == null || creators.size() == 0) { + return ""; + } + if (creators.size() == 1) { + return creators.get(0); + } + StringBuffer c = new StringBuffer(); + for (String s : creators) { + c.append(", ").append(s); + } + return c.substring(2); + } + + /** + * Extracts Media Management metadata from XMP. + * + * Silently swallows exceptions. + * @param xmp + * @param metadata + */ + public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) { + XMPSchemaMediaManagement mmSchema = null; + try { + mmSchema = xmp.getMediaManagementSchema(); + } catch (IOException e) { + //swallow + return; + } + if (mmSchema != null) { + addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID()); + //not currently supported by JempBox... 
+// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID()); + + ResourceRef derivedFrom = mmSchema.getDerivedFrom(); + if (derivedFrom != null) { + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID()); + } catch (NullPointerException e) {} + + try { + addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID()); + } catch (NullPointerException e) {} + + //TODO: not yet supported by XMPBox...extract OriginalDocumentID + //in DerivedFrom section + } + if (mmSchema.getHistory() != null) { + for (ResourceEvent stevt : mmSchema.getHistory()) { + String instanceId = null; + String action = null; + Calendar when = null; + String softwareAgent = null; + try { + instanceId = stevt.getInstanceID(); + action = stevt.getAction(); + when = stevt.getWhen(); + softwareAgent = stevt.getSoftwareAgent(); + + //instanceid can throw npe; getWhen can throw IOException + } catch (NullPointerException|IOException e) { + //swallow + } + if (instanceId != null && instanceId.trim().length() > 0) { + //for absent data elements, pass in empty strings so + //that parallel arrays will have matching offsets + //for absent data + + action = (action == null) ? "" : action; + String dateString = (when == null) ? "" : DateUtils.formatDate(when); + softwareAgent = (softwareAgent == null) ? 
"" : softwareAgent; + + metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId); + metadata.add(XMPMM.HISTORY_ACTION, action); + metadata.add(XMPMM.HISTORY_WHEN, dateString); + metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent); + } + } + } + } + } + + private static void addMetadata(Metadata m, Property p, String value) { + if (value != null) { + m.add(p, value); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java new file mode 100644 index 0000000..70018cd --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */ + +package org.apache.tika.parser.xmp; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import static java.nio.charset.StandardCharsets.US_ASCII; + +/** + * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet + * it finds and parses it. + * <p/> + * Important: Before you use this class to look for an XMP packet in some random file, please read + * the chapter on "Scanning Files for XMP Packets" in the XMP specification! + * <p/> + * This class was branched from http://xmlgraphics.apache.org/ XMPPacketParser. + * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant. + */ +public class XMPPacketScanner { + + private static final byte[] PACKET_HEADER; + private static final byte[] PACKET_HEADER_END; + private static final byte[] PACKET_TRAILER; + + static { + PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII); + PACKET_HEADER_END = "?>".getBytes(US_ASCII); + PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII); + } + + private static boolean skipAfter(InputStream in, byte[] match) throws IOException { + return skipAfter(in, match, null); + } + + private static boolean skipAfter(InputStream in, byte[] match, OutputStream out) + throws IOException { + int found = 0; + int len = match.length; + int b; + while ((b = in.read()) >= 0) { + if (b == match[found]) { + found++; + if (found == len) { + return true; + } + } else { + if (out != null) { + if (found > 0) { + out.write(match, 0, found); + } + out.write(b); + } + found = 0; + } + } + return false; + } + + /** + * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no + * XMP packet is found until the stream ends, null is returned. Note: This method + * only finds the first XMP packet in a stream. And it cannot determine whether it + * has found the right XMP packet if there are multiple packets. 
+ * <p/> + * Does <em>not</em> close the stream. + * If XMP block was found reading can continue below the block. + * + * @param in the InputStream to search + * @param xmlOut to write the XMP packet to + * @return true if XMP packet is found, false otherwise + * @throws IOException if an I/O error occurs + * @throws TransformerException if an error occurs while parsing the XMP packet + */ + public boolean parse(InputStream in, OutputStream xmlOut) throws IOException { + if (!in.markSupported()) { + in = new java.io.BufferedInputStream(in); + } + boolean foundXMP = skipAfter(in, PACKET_HEADER); + if (!foundXMP) { + return false; + } + //TODO Inspect "begin" attribute! + if (!skipAfter(in, PACKET_HEADER_END)) { + throw new IOException("Invalid XMP packet header!"); + } + //TODO Do with TeeInputStream when Commons IO 1.4 is available + if (!skipAfter(in, PACKET_TRAILER, xmlOut)) { + throw new IOException("XMP packet not properly terminated!"); + } + return true; + } + +} + http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java new file mode 100644 index 0000000..849fd01 --- /dev/null +++ b/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xmp; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collection; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.junit.Test; + +public class JempboxExtractorTest { + + @Test + public void testParseJpeg() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); + // set some values before extraction to see that they are overridden + metadata.set(TikaCoreProperties.TITLE, "old title"); + metadata.set(TikaCoreProperties.DESCRIPTION, "old description"); + metadata.set(TikaCoreProperties.CREATOR, "previous author"); + // ... 
or kept in case the field is multi-value + metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // DublinCore fields + assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("oldkeyword")); + assertTrue(keywords.contains("grazelands")); + assertTrue(keywords.contains("nature reserve")); + assertTrue(keywords.contains("bird watching")); + assertTrue(keywords.contains("coast")); + Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); + assertTrue(subject.contains("oldkeyword")); + assertTrue(subject.contains("grazelands")); + assertTrue(subject.contains("nature reserve")); + assertTrue(subject.contains("bird watching")); + assertTrue(subject.contains("coast")); + } + + @Test + public void testParseJpegPhotoshop() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // DublinCore fields + assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("bird watching")); + assertTrue(keywords.contains("coast")); + } + + @Test + public void 
testParseJpegXnviewmp() throws IOException, TikaException { + Metadata metadata = new Metadata(); + InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); + + JempboxExtractor extractor = new JempboxExtractor(metadata); + extractor.parse(stream); + + // XnViewMp fields not understood by Jempbox + assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); + Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); + assertTrue(keywords.contains("coast")); + assertTrue(keywords.contains("nature reserve")); + } + + @Test + public void testJoinCreators() { + assertEquals("Mr B", new JempboxExtractor(null).joinCreators( + Arrays.asList("Mr B"))); + // TODO use multi-value property instead? + assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators( + Arrays.asList("Mr B", "Mr A"))); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/pom.xml b/tika-parser-modules/tika-parser-xmp-module/pom.xml deleted file mode 100644 index 2101075..0000000 --- a/tika-parser-modules/tika-parser-xmp-module/pom.xml +++ /dev/null @@ -1,52 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor - license agreements. See the NOTICE file distributed with this work for additional - information regarding copyright ownership. The ASF licenses this file to - you under the Apache License, Version 2.0 (the "License"); you may not use - this file except in compliance with the License. 
You may obtain a copy of - the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required - by applicable law or agreed to in writing, software distributed under the - License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS - OF ANY KIND, either express or implied. See the License for the specific - language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parser-modules</artifactId> - <version>2.0-SNAPSHOT</version> - </parent> - - <artifactId>tika-parser-xmp-module</artifactId> - <name>Apache Tika parser xmp module</name> - <url>http://tika.apache.org/</url> - - <properties> - <mime4j.version>0.7.2</mime4j.version> - </properties> - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>tika-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.pdfbox</groupId> - <artifactId>jempbox</artifactId> - <version>${pdfbox.version}</version> - </dependency> - </dependencies> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-dependency-plugin</artifactId> - </plugin> - </plugins> - </build> - -</project> http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java deleted file mode 
100644 index 4161c6e..0000000 --- a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.module.xmp.internal; - -import org.apache.tika.osgi.TikaAbstractBundleActivator; -import org.osgi.framework.BundleContext; - -public class Activator extends TikaAbstractBundleActivator { - - @Override - public void start(BundleContext context) throws Exception { - - registerTikaParserServiceLoader(context, Activator.class.getClassLoader()); - - } - - @Override - public void stop(BundleContext context) throws Exception { - - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java deleted file mode 100644 index aa72896..0000000 --- 
a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.xmp; - -import static java.nio.charset.StandardCharsets.UTF_8; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.Calendar; -import java.util.List; - -import org.apache.jempbox.xmp.ResourceEvent; -import org.apache.jempbox.xmp.ResourceRef; -import org.apache.jempbox.xmp.XMPMetadata; -import org.apache.jempbox.xmp.XMPSchemaDublinCore; -import org.apache.jempbox.xmp.XMPSchemaMediaManagement; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.Property; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.XMPMM; -import org.apache.tika.utils.DateUtils; -import org.xml.sax.InputSource; - -public class JempboxExtractor { - - // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8" - private 
static final String DEFAULT_XMP_CHARSET = UTF_8.name(); - private XMPPacketScanner scanner = new XMPPacketScanner(); - private Metadata metadata; - - public JempboxExtractor(Metadata metadata) { - this.metadata = metadata; - } - - public void parse(InputStream file) throws IOException, TikaException { - ByteArrayOutputStream xmpraw = new ByteArrayOutputStream(); - if (!scanner.parse(file, xmpraw)) { - return; - } - - Reader decoded = new InputStreamReader( - new ByteArrayInputStream(xmpraw.toByteArray()), - DEFAULT_XMP_CHARSET); - XMPMetadata xmp = null; - try { - xmp = XMPMetadata.load(new InputSource(decoded)); - } catch (IOException e) { - // - } - - if (xmp == null) { - return; - } - XMPSchemaDublinCore dc = null; - try { - dc = xmp.getDublinCoreSchema(); - } catch (IOException e) { - } - - if (dc != null) { - if (dc.getTitle() != null) { - metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); - } - if (dc.getDescription() != null) { - metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); - } - if (dc.getCreators() != null && dc.getCreators().size() > 0) { - metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators())); - } - if (dc.getSubjects() != null && dc.getSubjects().size() > 0) { - for (String keyword : dc.getSubjects()) { - metadata.add(TikaCoreProperties.KEYWORDS, keyword); - } - // TODO should we set KEYWORDS too? - // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject - } - } - extractXMPMM(xmp, metadata); - } - - protected String joinCreators(List<String> creators) { - if (creators == null || creators.size() == 0) { - return ""; - } - if (creators.size() == 1) { - return creators.get(0); - } - StringBuffer c = new StringBuffer(); - for (String s : creators) { - c.append(", ").append(s); - } - return c.substring(2); - } - - /** - * Extracts Media Management metadata from XMP. - * - * Silently swallows exceptions. 
- * @param xmp - * @param metadata - */ - public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) { - XMPSchemaMediaManagement mmSchema = null; - try { - mmSchema = xmp.getMediaManagementSchema(); - } catch (IOException e) { - //swallow - return; - } - if (mmSchema != null) { - addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID()); - //not currently supported by JempBox... -// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID()); - - ResourceRef derivedFrom = mmSchema.getDerivedFrom(); - if (derivedFrom != null) { - try { - addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID()); - } catch (NullPointerException e) {} - - try { - addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID()); - } catch (NullPointerException e) {} - - //TODO: not yet supported by XMPBox...extract OriginalDocumentID - //in DerivedFrom section - } - if (mmSchema.getHistory() != null) { - for (ResourceEvent stevt : mmSchema.getHistory()) { - String instanceId = null; - String action = null; - Calendar when = null; - String softwareAgent = null; - try { - instanceId = stevt.getInstanceID(); - action = stevt.getAction(); - when = stevt.getWhen(); - softwareAgent = stevt.getSoftwareAgent(); - - //instanceid can throw npe; getWhen can throw IOException - } catch (NullPointerException|IOException e) { - //swallow - } - if (instanceId != null && instanceId.trim().length() > 0) { - //for absent data elements, pass in empty strings so - //that parallel arrays will have matching offsets - //for absent data - - action = (action == null) ? "" : action; - String dateString = (when == null) ? "" : DateUtils.formatDate(when); - softwareAgent = (softwareAgent == null) ? 
"" : softwareAgent; - - metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId); - metadata.add(XMPMM.HISTORY_ACTION, action); - metadata.add(XMPMM.HISTORY_WHEN, dateString); - metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent); - } - } - } - } - } - - private static void addMetadata(Metadata m, Property p, String value) { - if (value != null) { - m.add(p, value); - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java deleted file mode 100644 index 70018cd..0000000 --- a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */ - -package org.apache.tika.parser.xmp; - -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import static java.nio.charset.StandardCharsets.US_ASCII; - -/** - * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet - * it finds and parses it. - * <p/> - * Important: Before you use this class to look for an XMP packet in some random file, please read - * the chapter on "Scanning Files for XMP Packets" in the XMP specification! - * <p/> - * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser. - * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant. - */ -public class XMPPacketScanner { - - private static final byte[] PACKET_HEADER; - private static final byte[] PACKET_HEADER_END; - private static final byte[] PACKET_TRAILER; - - static { - PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII); - PACKET_HEADER_END = "?>".getBytes(US_ASCII); - PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII); - } - - private static boolean skipAfter(InputStream in, byte[] match) throws IOException { - return skipAfter(in, match, null); - } - - private static boolean skipAfter(InputStream in, byte[] match, OutputStream out) - throws IOException { - int found = 0; - int len = match.length; - int b; - while ((b = in.read()) >= 0) { - if (b == match[found]) { - found++; - if (found == len) { - return true; - } - } else { - if (out != null) { - if (found > 0) { - out.write(match, 0, found); - } - out.write(b); - } - found = 0; - } - } - return false; - } - - /** - * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no - * XMP packet is found until the stream ends, null is returned. Note: This method - * only finds the first XMP packet in a stream. And it cannot determine whether it - * has found the right XMP packet if there are multiple packets. 
- * <p/> - * Does <em>not</em> close the stream. - * If XMP block was found reading can continue below the block. - * - * @param in the InputStream to search - * @param xmlOut to write the XMP packet to - * @return true if XMP packet is found, false otherwise - * @throws IOException if an I/O error occurs - * @throws TransformerException if an error occurs while parsing the XMP packet - */ - public boolean parse(InputStream in, OutputStream xmlOut) throws IOException { - if (!in.markSupported()) { - in = new java.io.BufferedInputStream(in); - } - boolean foundXMP = skipAfter(in, PACKET_HEADER); - if (!foundXMP) { - return false; - } - //TODO Inspect "begin" attribute! - if (!skipAfter(in, PACKET_HEADER_END)) { - throw new IOException("Invalid XMP packet header!"); - } - //TODO Do with TeeInputStream when Commons IO 1.4 is available - if (!skipAfter(in, PACKET_TRAILER, xmlOut)) { - throw new IOException("XMP packet not properly terminated!"); - } - return true; - } - -} - http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java deleted file mode 100644 index 849fd01..0000000 --- a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.xmp; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; -import java.util.Collection; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.junit.Test; - -public class JempboxExtractorTest { - - @Test - public void testParseJpeg() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg"); - // set some values before extraction to see that they are overridden - metadata.set(TikaCoreProperties.TITLE, "old title"); - metadata.set(TikaCoreProperties.DESCRIPTION, "old description"); - metadata.set(TikaCoreProperties.CREATOR, "previous author"); - // ... 
or kept in case the field is multi-value - metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // DublinCore fields - assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("oldkeyword")); - assertTrue(keywords.contains("grazelands")); - assertTrue(keywords.contains("nature reserve")); - assertTrue(keywords.contains("bird watching")); - assertTrue(keywords.contains("coast")); - Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT)); - assertTrue(subject.contains("oldkeyword")); - assertTrue(subject.contains("grazelands")); - assertTrue(subject.contains("nature reserve")); - assertTrue(subject.contains("bird watching")); - assertTrue(subject.contains("coast")); - } - - @Test - public void testParseJpegPhotoshop() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // DublinCore fields - assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE)); - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("bird watching")); - assertTrue(keywords.contains("coast")); - } - - @Test - public void 
testParseJpegXnviewmp() throws IOException, TikaException { - Metadata metadata = new Metadata(); - InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg"); - - JempboxExtractor extractor = new JempboxExtractor(metadata); - extractor.parse(stream); - - // XnViewMp fields not understood by Jempbox - assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION)); - Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)); - assertTrue(keywords.contains("coast")); - assertTrue(keywords.contains("nature reserve")); - } - - @Test - public void testJoinCreators() { - assertEquals("Mr B", new JempboxExtractor(null).joinCreators( - Arrays.asList("Mr B"))); - // TODO use multi-value property instead? - assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators( - Arrays.asList("Mr B", "Mr A"))); - } - -}
