tika git commit: TIKA-1894 -- clean up following recommendations from Ray Gauss and Bob Paulin.

tallison Sun, 20 Mar 2016 00:20:11 -0700

Repository: tika
Updated Branches:
  refs/heads/2.x 5f413ffa7 -> c58af959b



TIKA-1894 -- clean up following recommendations from Ray Gauss and Bob Paulin.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c58af959
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c58af959
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c58af959

Branch: refs/heads/2.x
Commit: c58af959b6cc3f3a3d8f555d53b147388e36b01d
Parents: 5f413ff
Author: tballison <[email protected]>
Authored: Fri Mar 18 12:24:13 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Mar 18 12:24:13 2016 -0400

----------------------------------------------------------------------
 .../tika-parser-multimedia-bundle/pom.xml       |   2 +-
 .../tika-parser-pdf-bundle/pom.xml              |   2 +-
 tika-parser-modules/pom.xml                     |   2 +-
 .../tika-parser-multimedia-module/pom.xml       |   2 +-
 .../tika-parser-pdf-module/pom.xml              |   2 +-
 .../tika-parser-xmp-commons/pom.xml             |  48 +++++
 .../tika/parser/xmp/JempboxExtractor.java       | 187 +++++++++++++++++++
 .../tika/parser/xmp/XMPPacketScanner.java       | 113 +++++++++++
 .../tika/parser/xmp/JempboxExtractorTest.java   | 107 +++++++++++
 .../tika-parser-xmp-module/pom.xml              |  52 ------
 .../tika/module/xmp/internal/Activator.java     |  36 ----
 .../tika/parser/xmp/JempboxExtractor.java       | 187 -------------------
 .../tika/parser/xmp/XMPPacketScanner.java       | 113 -----------
 .../tika/parser/xmp/JempboxExtractorTest.java   | 107 -----------
 14 files changed, 460 insertions(+), 500 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
index 7b528bc..067d6f3 100644
--- a/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-multimedia-bundle/pom.xml
@@ -45,7 +45,7 @@
               com.sun.xml.internal.bind.marshaller</_runsystempackages>
             <Embed-Dependency>
               tika-parser-multimedia-module;inline=true,
-              tika-parser-xmp-module;inline=true,
+              tika-parser-xmp-commons;inline=true,
               metadata-extractor;inline=true,
               xmpcore;inline=true,
               commons-codec;inline=true,

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index 27773a8..771389a 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -47,7 +47,7 @@
             
<Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
             <Embed-Dependency>
               tika-parser-pdf-module;inline=true,
-              tika-parser-xmp-module;inline=true,
+              tika-parser-xmp-commons;inline=true,
               commons-io;inline=true,
               pdfbox;inline=true,
               bcmail-jdk15on;inline=true,

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index 6342ebb..88205ca 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -56,7 +56,7 @@
     <module>tika-parser-scientific-module</module>
     <module>tika-parser-text-module</module>
     <module>tika-parser-web-module</module>
-    <module>tika-parser-xmp-module</module>
+    <module>tika-parser-xmp-commons</module>
   </modules>
 
   <dependencies>

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 63ea5aa..632ed86 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -37,7 +37,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-xmp-module</artifactId>
+      <artifactId>tika-parser-xmp-commons</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index a706ff3..48f8eec 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -35,7 +35,7 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-parser-xmp-module</artifactId>
+      <artifactId>tika-parser-xmp-commons</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-commons/pom.xml b/tika-parser-modules/tika-parser-xmp-commons/pom.xml
new file mode 100644
index 0000000..80729cc
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-commons/pom.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-parser-xmp-commons</artifactId>
+  <name>Apache Tika parser xmp commons</name>
+  <url>http://tika.apache.org/</url>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>jempbox</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
new file mode 100644
index 0000000..aa72896
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Calendar;
+import java.util.List;
+
+import org.apache.jempbox.xmp.ResourceEvent;
+import org.apache.jempbox.xmp.ResourceRef;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.InputSource;
+
+public class JempboxExtractor {
+
+    // The XMP spec says it must be unicode, but for most file formats it 
specifies "must be encoded in UTF-8"
+    private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+    private XMPPacketScanner scanner = new XMPPacketScanner();
+    private Metadata metadata;
+
+    public JempboxExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    public void parse(InputStream file) throws IOException, TikaException {
+        ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
+        if (!scanner.parse(file, xmpraw)) {
+            return;
+        }
+
+        Reader decoded = new InputStreamReader(
+                new ByteArrayInputStream(xmpraw.toByteArray()),
+                DEFAULT_XMP_CHARSET);
+        XMPMetadata xmp = null;
+        try {
+            xmp = XMPMetadata.load(new InputSource(decoded));
+        } catch (IOException e) {
+            //
+        }
+
+        if (xmp == null) {
+            return;
+        }
+        XMPSchemaDublinCore dc = null;
+        try {
+            dc = xmp.getDublinCoreSchema();
+        } catch (IOException e) {
+        }
+
+        if (dc != null) {
+            if (dc.getTitle() != null) {
+                metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
+            }
+            if (dc.getDescription() != null) {
+                metadata.set(TikaCoreProperties.DESCRIPTION, 
dc.getDescription());
+            }
+            if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+                metadata.set(TikaCoreProperties.CREATOR, 
joinCreators(dc.getCreators()));
+            }
+            if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+                for (String keyword : dc.getSubjects()) {
+                    metadata.add(TikaCoreProperties.KEYWORDS, keyword);
+                }
+                // TODO should we set KEYWORDS too?
+                // All tested photo managers set the same in 
Iptc.Application2.Keywords and Xmp.dc.subject
+            }
+        }
+        extractXMPMM(xmp, metadata);
+    }
+
+    protected String joinCreators(List<String> creators) {
+        if (creators == null || creators.size() == 0) {
+            return "";
+        }
+        if (creators.size() == 1) {
+            return creators.get(0);
+        }
+        StringBuffer c = new StringBuffer();
+        for (String s : creators) {
+            c.append(", ").append(s);
+        }
+        return c.substring(2);
+    }
+
+    /**
+     * Extracts Media Management metadata from XMP.
+     *
+     * Silently swallows exceptions.
+     * @param xmp
+     * @param metadata
+     */
+    public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
+        XMPSchemaMediaManagement mmSchema = null;
+        try {
+            mmSchema = xmp.getMediaManagementSchema();
+        } catch (IOException e) {
+            //swallow
+            return;
+        }
+        if (mmSchema != null) {
+            addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
+            //not currently supported by JempBox...
+//          metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
+
+            ResourceRef derivedFrom = mmSchema.getDerivedFrom();
+            if (derivedFrom != null) {
+                try {
+                    addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, 
derivedFrom.getDocumentID());
+                } catch (NullPointerException e) {}
+
+                try {
+                    addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, 
derivedFrom.getInstanceID());
+                } catch (NullPointerException e) {}
+
+                //TODO: not yet supported by XMPBox...extract 
OriginalDocumentID
+                //in DerivedFrom section
+            }
+            if (mmSchema.getHistory() != null) {
+                for (ResourceEvent stevt : mmSchema.getHistory()) {
+                    String instanceId = null;
+                    String action = null;
+                    Calendar when = null;
+                    String softwareAgent = null;
+                    try {
+                        instanceId = stevt.getInstanceID();
+                        action = stevt.getAction();
+                        when = stevt.getWhen();
+                        softwareAgent = stevt.getSoftwareAgent();
+
+                        //instanceid can throw npe; getWhen can throw 
IOException
+                    } catch (NullPointerException|IOException e) {
+                       //swallow
+                    }
+                    if (instanceId != null && instanceId.trim().length() > 0) {
+                        //for absent data elements, pass in empty strings so
+                        //that parallel arrays will have matching offsets
+                        //for absent data
+
+                        action = (action == null) ? "" : action;
+                        String dateString = (when == null) ? "" : 
DateUtils.formatDate(when);
+                        softwareAgent = (softwareAgent == null) ? "" : 
softwareAgent;
+
+                        metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, 
instanceId);
+                        metadata.add(XMPMM.HISTORY_ACTION, action);
+                        metadata.add(XMPMM.HISTORY_WHEN, dateString);
+                        metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, 
softwareAgent);
+                    }
+                }
+            }
+        }
+    }
+
+    private static void addMetadata(Metadata m, Property p, String value) {
+        if (value != null) {
+            m.add(p, value);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
new file mode 100644
index 0000000..70018cd
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
+
+package org.apache.tika.parser.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * This class is a parser for XMP packets. By default, it tries to locate the 
first XMP packet
+ * it finds and parses it.
+ * <p/>
+ * Important: Before you use this class to look for an XMP packet in some 
random file, please read
+ * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
+ * <p/>
+ * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
+ * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
+ */
+public class XMPPacketScanner {
+
+    private static final byte[] PACKET_HEADER;
+    private static final byte[] PACKET_HEADER_END;
+    private static final byte[] PACKET_TRAILER;
+
+    static {
+        PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
+        PACKET_HEADER_END = "?>".getBytes(US_ASCII);
+        PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
+    }
+
+    private static boolean skipAfter(InputStream in, byte[] match) throws 
IOException {
+        return skipAfter(in, match, null);
+    }
+
+    private static boolean skipAfter(InputStream in, byte[] match, 
OutputStream out)
+            throws IOException {
+        int found = 0;
+        int len = match.length;
+        int b;
+        while ((b = in.read()) >= 0) {
+            if (b == match[found]) {
+                found++;
+                if (found == len) {
+                    return true;
+                }
+            } else {
+                if (out != null) {
+                    if (found > 0) {
+                        out.write(match, 0, found);
+                    }
+                    out.write(b);
+                }
+                found = 0;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Locates an XMP packet in a stream, parses it and returns the XMP 
metadata. If no
+     * XMP packet is found until the stream ends, null is returned. Note: This 
method
+     * only finds the first XMP packet in a stream. And it cannot determine 
whether it
+     * has found the right XMP packet if there are multiple packets.
+     * <p/>
+     * Does <em>not</em> close the stream.
+     * If XMP block was found reading can continue below the block.
+     *
+     * @param in     the InputStream to search
+     * @param xmlOut to write the XMP packet to
+     * @return true if XMP packet is found, false otherwise
+     * @throws IOException          if an I/O error occurs
+     * @throws TransformerException if an error occurs while parsing the XMP 
packet
+     */
+    public boolean parse(InputStream in, OutputStream xmlOut) throws 
IOException {
+        if (!in.markSupported()) {
+            in = new java.io.BufferedInputStream(in);
+        }
+        boolean foundXMP = skipAfter(in, PACKET_HEADER);
+        if (!foundXMP) {
+            return false;
+        }
+        //TODO Inspect "begin" attribute!
+        if (!skipAfter(in, PACKET_HEADER_END)) {
+            throw new IOException("Invalid XMP packet header!");
+        }
+        //TODO Do with TeeInputStream when Commons IO 1.4 is available
+        if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
+            throw new IOException("XMP packet not properly terminated!");
+        }
+        return true;
+    }
+
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
new file mode 100644
index 0000000..849fd01
--- /dev/null
+++ b/tika-parser-modules/tika-parser-xmp-commons/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xmp;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class JempboxExtractorTest {
+
+    @Test
+    public void testParseJpeg() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        InputStream stream = 
getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+        // set some values before extraction to see that they are overridden
+        metadata.set(TikaCoreProperties.TITLE, "old title");
+        metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
+        metadata.set(TikaCoreProperties.CREATOR, "previous author");
+        // ... or kept in case the field is multi-value
+        metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword");
+
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", 
metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        Collection<String> keywords = 
Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("oldkeyword"));
+        assertTrue(keywords.contains("grazelands"));
+        assertTrue(keywords.contains("nature reserve"));
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+        Collection<String> subject = 
Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue(subject.contains("oldkeyword"));
+        assertTrue(subject.contains("grazelands"));
+        assertTrue(subject.contains("nature reserve"));
+        assertTrue(subject.contains("bird watching"));
+        assertTrue(subject.contains("coast"));
+    }
+
+    @Test
+    public void testParseJpegPhotoshop() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        InputStream stream = 
getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", 
metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
+        Collection<String> keywords = 
Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+    }
+
+    @Test
+    public void testParseJpegXnviewmp() throws IOException, TikaException {
+        Metadata metadata = new Metadata();
+        InputStream stream = 
getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+
+        // XnViewMp fields not understood by Jempbox
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new 
line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+        Collection<String> keywords = 
Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
+        assertTrue(keywords.contains("coast"));
+        assertTrue(keywords.contains("nature reserve"));
+    }
+
+    @Test
+    public void testJoinCreators() {
+        assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B")));
+        // TODO use multi-value property instead?
+        assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B", "Mr A")));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/pom.xml b/tika-parser-modules/tika-parser-xmp-module/pom.xml
deleted file mode 100644
index 2101075..0000000
--- a/tika-parser-modules/tika-parser-xmp-module/pom.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
-  license agreements. See the NOTICE file distributed with this work for additional 
-  information regarding copyright ownership. The ASF licenses this file to 
-  you under the Apache License, Version 2.0 (the "License"); you may not use 
-  this file except in compliance with the License. You may obtain a copy of 
-  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
-  by applicable law or agreed to in writing, software distributed under the 
-  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
-  OF ANY KIND, either express or implied. See the License for the specific 
-  language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-
-  <parent>
-    <groupId>org.apache.tika</groupId>
-    <artifactId>tika-parser-modules</artifactId>
-    <version>2.0-SNAPSHOT</version>
-  </parent>
-
-  <artifactId>tika-parser-xmp-module</artifactId>
-  <name>Apache Tika parser xmp module</name>
-  <url>http://tika.apache.org/</url>
-  
-  <properties>
-    <mime4j.version>0.7.2</mime4j.version>
-  </properties>
-  
-  <dependencies>
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>tika-core</artifactId>
-      <version>${project.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pdfbox</groupId>
-      <artifactId>jempbox</artifactId>
-      <version>${pdfbox.version}</version>
-    </dependency>
-  </dependencies>
-  
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-dependency-plugin</artifactId>
-      </plugin>
-    </plugins>
-  </build>
-
-</project>

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
deleted file mode 100644
index 4161c6e..0000000
--- a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/module/xmp/internal/Activator.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.xmp.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
-    @Override
-    public void start(BundleContext context) throws Exception {
-
-        registerTikaParserServiceLoader(context, 
Activator.class.getClassLoader());
-
-    }
-
-    @Override
-    public void stop(BundleContext context) throws Exception {
-
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
deleted file mode 100644
index aa72896..0000000
--- a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xmp;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.Calendar;
-import java.util.List;
-
-import org.apache.jempbox.xmp.ResourceEvent;
-import org.apache.jempbox.xmp.ResourceRef;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.jempbox.xmp.XMPSchemaDublinCore;
-import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPMM;
-import org.apache.tika.utils.DateUtils;
-import org.xml.sax.InputSource;
-
-public class JempboxExtractor {
-
-    // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
-    private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
-    private XMPPacketScanner scanner = new XMPPacketScanner();
-    private Metadata metadata;
-
-    public JempboxExtractor(Metadata metadata) {
-        this.metadata = metadata;
-    }
-
-    public void parse(InputStream file) throws IOException, TikaException {
-        ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
-        if (!scanner.parse(file, xmpraw)) {
-            return;
-        }
-
-        Reader decoded = new InputStreamReader(
-                new ByteArrayInputStream(xmpraw.toByteArray()),
-                DEFAULT_XMP_CHARSET);
-        XMPMetadata xmp = null;
-        try {
-            xmp = XMPMetadata.load(new InputSource(decoded));
-        } catch (IOException e) {
-            //
-        }
-
-        if (xmp == null) {
-            return;
-        }
-        XMPSchemaDublinCore dc = null;
-        try {
-            dc = xmp.getDublinCoreSchema();
-        } catch (IOException e) {
-        }
-
-        if (dc != null) {
-            if (dc.getTitle() != null) {
-                metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
-            }
-            if (dc.getDescription() != null) {
-                metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
-            }
-            if (dc.getCreators() != null && dc.getCreators().size() > 0) {
-                metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
-            }
-            if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
-                for (String keyword : dc.getSubjects()) {
-                    metadata.add(TikaCoreProperties.KEYWORDS, keyword);
-                }
-                // TODO should we set KEYWORDS too?
-                // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
-            }
-        }
-        extractXMPMM(xmp, metadata);
-    }
-
-    protected String joinCreators(List<String> creators) {
-        if (creators == null || creators.size() == 0) {
-            return "";
-        }
-        if (creators.size() == 1) {
-            return creators.get(0);
-        }
-        StringBuffer c = new StringBuffer();
-        for (String s : creators) {
-            c.append(", ").append(s);
-        }
-        return c.substring(2);
-    }
-
-    /**
-     * Extracts Media Management metadata from XMP.
-     *
-     * Silently swallows exceptions.
-     * @param xmp
-     * @param metadata
-     */
-    public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
-        XMPSchemaMediaManagement mmSchema = null;
-        try {
-            mmSchema = xmp.getMediaManagementSchema();
-        } catch (IOException e) {
-            //swallow
-            return;
-        }
-        if (mmSchema != null) {
-            addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
-            //not currently supported by JempBox...
-//          metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
-
-            ResourceRef derivedFrom = mmSchema.getDerivedFrom();
-            if (derivedFrom != null) {
-                try {
-                    addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
-                } catch (NullPointerException e) {}
-
-                try {
-                    addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
-                } catch (NullPointerException e) {}
-
-                //TODO: not yet supported by XMPBox...extract OriginalDocumentID
-                //in DerivedFrom section
-            }
-            if (mmSchema.getHistory() != null) {
-                for (ResourceEvent stevt : mmSchema.getHistory()) {
-                    String instanceId = null;
-                    String action = null;
-                    Calendar when = null;
-                    String softwareAgent = null;
-                    try {
-                        instanceId = stevt.getInstanceID();
-                        action = stevt.getAction();
-                        when = stevt.getWhen();
-                        softwareAgent = stevt.getSoftwareAgent();
-
-                        //instanceid can throw npe; getWhen can throw IOException
-                    } catch (NullPointerException|IOException e) {
-                       //swallow
-                    }
-                    if (instanceId != null && instanceId.trim().length() > 0) {
-                        //for absent data elements, pass in empty strings so
-                        //that parallel arrays will have matching offsets
-                        //for absent data
-
-                        action = (action == null) ? "" : action;
-                        String dateString = (when == null) ? "" : DateUtils.formatDate(when);
-                        softwareAgent = (softwareAgent == null) ? "" : softwareAgent;
-
-                        metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
-                        metadata.add(XMPMM.HISTORY_ACTION, action);
-                        metadata.add(XMPMM.HISTORY_WHEN, dateString);
-                        metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
-                    }
-                }
-            }
-        }
-    }
-
-    private static void addMetadata(Metadata m, Property p, String value) {
-        if (value != null) {
-            m.add(p, value);
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java b/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
deleted file mode 100644
index 70018cd..0000000
--- a/tika-parser-modules/tika-parser-xmp-module/src/main/java/org/apache/tika/parser/xmp/XMPPacketScanner.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
-
-package org.apache.tika.parser.xmp;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-
-/**
- * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet
- * it finds and parses it.
- * <p/>
- * Important: Before you use this class to look for an XMP packet in some random file, please read
- * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
- * <p/>
- * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
- * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
- */
-public class XMPPacketScanner {
-
-    private static final byte[] PACKET_HEADER;
-    private static final byte[] PACKET_HEADER_END;
-    private static final byte[] PACKET_TRAILER;
-
-    static {
-        PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
-        PACKET_HEADER_END = "?>".getBytes(US_ASCII);
-        PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
-    }
-
-    private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
-        return skipAfter(in, match, null);
-    }
-
-    private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
-            throws IOException {
-        int found = 0;
-        int len = match.length;
-        int b;
-        while ((b = in.read()) >= 0) {
-            if (b == match[found]) {
-                found++;
-                if (found == len) {
-                    return true;
-                }
-            } else {
-                if (out != null) {
-                    if (found > 0) {
-                        out.write(match, 0, found);
-                    }
-                    out.write(b);
-                }
-                found = 0;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no
-     * XMP packet is found until the stream ends, null is returned. Note: This method
-     * only finds the first XMP packet in a stream. And it cannot determine whether it
-     * has found the right XMP packet if there are multiple packets.
-     * <p/>
-     * Does <em>not</em> close the stream.
-     * If XMP block was found reading can continue below the block.
-     *
-     * @param in     the InputStream to search
-     * @param xmlOut to write the XMP packet to
-     * @return true if XMP packet is found, false otherwise
-     * @throws IOException          if an I/O error occurs
-     * @throws TransformerException if an error occurs while parsing the XMP packet
-     */
-    public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
-        if (!in.markSupported()) {
-            in = new java.io.BufferedInputStream(in);
-        }
-        boolean foundXMP = skipAfter(in, PACKET_HEADER);
-        if (!foundXMP) {
-            return false;
-        }
-        //TODO Inspect "begin" attribute!
-        if (!skipAfter(in, PACKET_HEADER_END)) {
-            throw new IOException("Invalid XMP packet header!");
-        }
-        //TODO Do with TeeInputStream when Commons IO 1.4 is available
-        if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
-            throw new IOException("XMP packet not properly terminated!");
-        }
-        return true;
-    }
-
-}
-

http://git-wip-us.apache.org/repos/asf/tika/blob/c58af959/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java b/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
deleted file mode 100644
index 849fd01..0000000
--- a/tika-parser-modules/tika-parser-xmp-module/src/test/java/org/apache/tika/parser/xmp/JempboxExtractorTest.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.xmp;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collection;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-public class JempboxExtractorTest {
-
-    @Test
-    public void testParseJpeg() throws IOException, TikaException {
-        Metadata metadata = new Metadata();
-        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
-        // set some values before extraction to see that they are overridden
-        metadata.set(TikaCoreProperties.TITLE, "old title");
-        metadata.set(TikaCoreProperties.DESCRIPTION, "old description");
-        metadata.set(TikaCoreProperties.CREATOR, "previous author");
-        // ... or kept in case the field is multi-value
-        metadata.add(TikaCoreProperties.KEYWORDS, "oldkeyword");
-
-        JempboxExtractor extractor = new JempboxExtractor(metadata);
-        extractor.parse(stream);
-
-        // DublinCore fields
-        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
-        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue(keywords.contains("oldkeyword"));
-        assertTrue(keywords.contains("grazelands"));
-        assertTrue(keywords.contains("nature reserve"));
-        assertTrue(keywords.contains("bird watching"));
-        assertTrue(keywords.contains("coast"));
-        Collection<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
-        assertTrue(subject.contains("oldkeyword"));
-        assertTrue(subject.contains("grazelands"));
-        assertTrue(subject.contains("nature reserve"));
-        assertTrue(subject.contains("bird watching"));
-        assertTrue(subject.contains("coast"));
-    }
-
-    @Test
-    public void testParseJpegPhotoshop() throws IOException, TikaException {
-        Metadata metadata = new Metadata();
-        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
-
-        JempboxExtractor extractor = new JempboxExtractor(metadata);
-        extractor.parse(stream);
-
-        // DublinCore fields
-        assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Some Tourist", metadata.get(TikaCoreProperties.CREATOR));
-        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue(keywords.contains("bird watching"));
-        assertTrue(keywords.contains("coast"));
-    }
-
-    @Test
-    public void testParseJpegXnviewmp() throws IOException, TikaException {
-        Metadata metadata = new Metadata();
-        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
-
-        JempboxExtractor extractor = new JempboxExtractor(metadata);
-        extractor.parse(stream);
-
-        // XnViewMp fields not understood by Jempbox
-        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
-        Collection<String> keywords = Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS));
-        assertTrue(keywords.contains("coast"));
-        assertTrue(keywords.contains("nature reserve"));
-    }
-
-    @Test
-    public void testJoinCreators() {
-        assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
-                Arrays.asList("Mr B")));
-        // TODO use multi-value property instead?
-        assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
-                Arrays.asList("Mr B", "Mr A")));
-    }
-
-}

tika git commit: TIKA-1894 -- clean up following recommendations from Ray Gauss and Bob Paulin.

Reply via email to