This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3180
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3180 by this push:
     new cdd0bf1  TIKA-3180 -- modularize tika-server
cdd0bf1 is described below

commit cdd0bf135b9d725bb04f1a9736b9df8d0e57db21
Author: tallison <[email protected]>
AuthorDate: Wed Dec 16 11:39:33 2020 -0500

    TIKA-3180 -- modularize tika-server
---
 CHANGES.txt                                        |   3 +
 tika-app/pom.xml                                   |   2 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  32 ++-----
 .../extractor/DefaultEmbeddedStreamTranslator.java |  87 ++++++++++++++++++
 .../tika/extractor/EmbeddedStreamTranslator.java   |  38 ++++++++
 .../microsoft/MSEmbeddedStreamTranslator.java      | 101 +++++++++++++++++++++
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   4 +
 ....apache.tika.extractor.EmbeddedStreamTranslator |  15 +++
 tika-server/tika-server-classic/pom.xml            |   3 +-
 tika-server/tika-server-classic/src/TODO           |   8 --
 .../classic/resource/XMPMetadataResource.java      |  49 ++++++++++
 .../classic/writer/XMPMessageBodyWriter.java       |   4 +-
 ...he.tika.server.core.resource.TikaServerResource |  15 +++
 ...apache.tika.server.core.writer.TikaServerWriter |  15 +++
 .../tika/server/classic/MetadataResourceTest.java  |   6 +-
 .../org/apache/tika/server/core/TikaServerCli.java |  22 ++++-
 .../server/core/resource/MetadataResource.java     |   4 +-
 .../server/core/resource/TikaServerResource.java   |   4 +
 .../server/core/resource/UnpackerResource.java     |  51 ++---------
 .../tika/server/core/writer/TikaServerWriter.java  |  10 ++
 20 files changed, 389 insertions(+), 84 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 49a7ccc..0d61915 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,6 +11,9 @@ Release 2.0.0 - ???
    
    * General code cleanup (PeterAlfredLee)
 
+   * tika-server's /metadata endpoint requires tika-server-classic to write 
XMP/rdf output.
+     This output is not available in tika-server-core.
+
    Other changes
    
 Release 1.26 - ???
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 461c45d..14525ef 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -112,7 +112,7 @@
               </createDependencyReducedPom>
               <artifactSet>
                 <excludes>
-                  <exclude>org.apache.tika:tika-parsers:jar:</exclude>
+                  
<exclude>org.apache.tika:tika-parsers-classic-package:jar:</exclude>
                 </excludes>
               </artifactSet>
               <filters>
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index e2557e4..b246207 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -74,7 +74,9 @@ import org.apache.tika.config.TikaConfigSerializer;
 import org.apache.tika.detect.CompositeDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
 import org.apache.tika.fork.ForkParser;
 import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.io.TikaInputStream;
@@ -1041,6 +1043,7 @@ public class TikaCLI {
 
         private int count = 0;
         private final TikaConfig config = TikaConfig.getDefaultConfig();
+        private final EmbeddedStreamTranslator embeddedStreamTranslator = new 
DefaultEmbeddedStreamTranslator();
 
         public boolean shouldParseEmbedded(Metadata metadata) {
             return true;
@@ -1070,15 +1073,9 @@ public class TikaCLI {
             System.out.println("Extracting '"+name+"' ("+contentType+") to " + 
outputFile);
 
             try (FileOutputStream os = new FileOutputStream(outputFile)) {
-                if (inputStream instanceof TikaInputStream) {
-                    TikaInputStream tin = (TikaInputStream) inputStream;
-
-                    if (tin.getOpenContainer() != null && 
tin.getOpenContainer() instanceof DirectoryEntry) {
-                        POIFSFileSystem fs = new POIFSFileSystem();
-                        copy((DirectoryEntry) tin.getOpenContainer(), 
fs.getRoot());
-                        fs.writeFilesystem(os);
-                    } else {
-                        IOUtils.copy(inputStream, os);
+                if (embeddedStreamTranslator.shouldTranslate(inputStream, 
metadata)) {
+                    try (InputStream translated = 
embeddedStreamTranslator.translate(inputStream, metadata)) {
+                        IOUtils.copy(translated, os);
                     }
                 } else {
                     IOUtils.copy(inputStream, os);
@@ -1148,23 +1145,6 @@ public class TikaCLI {
             return ".bin";
 
         }
-
-        protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
-                throws IOException {
-            for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
-                if (entry instanceof DirectoryEntry) {
-                    // Need to recurse
-                    DirectoryEntry newDir = 
destDir.createDirectory(entry.getName());
-                    copy((DirectoryEntry) entry, newDir);
-                } else {
-                    // Copy entry
-                    try (InputStream contents =
-                            new DocumentInputStream((DocumentEntry) entry)) {
-                        destDir.createDocument(entry.getName(), contents);
-                    }
-                }
-            }
-        }
     }
 
     private class NoDocumentMetHandler extends DefaultHandler {
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
new file mode 100644
index 0000000..86af6c1
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * Loads EmbeddedStreamTranslators via service loading.  Tries to run each
+ * in turn and returns the first non-null value.  If no translation has 
occurred,
+ * this returns the original InputStream. If a translation has occurred, the
+ * translator will consume the InputStream but not close it.
+ */
+public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
+
+    final List<EmbeddedStreamTranslator> translators;
+
+    private static List<EmbeddedStreamTranslator> 
getDefaultFilters(ServiceLoader loader) {
+        List<EmbeddedStreamTranslator> embeddedStreamTranslators
+                = loader.loadServiceProviders(EmbeddedStreamTranslator.class);
+        ServiceLoaderUtils.sortLoadedClasses(embeddedStreamTranslators);
+        return embeddedStreamTranslators;
+    }
+
+    public DefaultEmbeddedStreamTranslator() {
+        this(getDefaultFilters(new ServiceLoader()));
+    }
+
+    private DefaultEmbeddedStreamTranslator(List<EmbeddedStreamTranslator> 
translators) {
+        this.translators = translators;
+    }
+
+    /**
+     * This should sniff the stream to determine if it needs to be translated.
+     * The translator is responsible for resetting the stream if any bytes 
have been read.
+     * @param inputStream
+     * @param metadata
+     * @return
+     * @throws IOException
+     */
+    @Override
+    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) 
throws IOException {
+        for (EmbeddedStreamTranslator translator : translators) {
+            if (translator.shouldTranslate(inputStream, metadata)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * This will consume the InputStream and return a new stream of translated 
bytes.
+     * @param inputStream
+     * @param metadata
+     * @return
+     * @throws IOException
+     */
+    @Override
+    public InputStream translate(InputStream inputStream, Metadata metadata) 
throws IOException {
+        for (EmbeddedStreamTranslator translator : translators) {
+            InputStream translated = translator.translate(inputStream, 
metadata);
+            if (translated != null) {
+                return translated;
+            }
+        }
+        return inputStream;
+    }
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
new file mode 100644
index 0000000..c6387fe
--- /dev/null
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Interface for different filtering of embedded streams.
+ * Specifically, unravel OLE streams in tika-server unpack,
+ * and/or handle open containers in TikaInputStream
+ *
+ * @since Apache Tika 2.0.0
+ */
+public interface EmbeddedStreamTranslator {
+
+    boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws 
IOException;
+
+    InputStream translate(InputStream inputStream,
+                          Metadata metadata) throws IOException;
+
+}
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
new file mode 100644
index 0000000..f8d157d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -0,0 +1,101 @@
+package org.apache.tika.extractor.microsoft;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator {
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class);
+
+    @Override
+    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) 
throws IOException {
+        String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+        if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
+            return true;
+        } else if (inputStream instanceof TikaInputStream) {
+            TikaInputStream tin = (TikaInputStream) inputStream;
+            if (tin.getOpenContainer() != null && tin.getOpenContainer() 
instanceof DirectoryEntry) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    @Override
+    public InputStream translate(InputStream inputStream, Metadata metadata) 
throws IOException {
+        String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+        if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(inputStream, bos);
+            byte[] data = bos.toByteArray();
+            POIFSFileSystem poifs = new POIFSFileSystem(new 
ByteArrayInputStream(data));
+            OfficeParser.POIFSDocumentType type = 
OfficeParser.POIFSDocumentType.detectType(poifs);
+            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+
+            if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
+                try {
+                    Ole10Native ole = 
Ole10Native.createFromEmbeddedOleObject(poifs);
+                    if (ole.getDataSize() > 0) {
+                        String label = ole.getLabel();
+
+                        name = label;
+
+                        data = ole.getDataBuffer();
+                    }
+                } catch (Ole10NativeException ex) {
+                    LOG.warn("Skipping invalid part", ex);
+                }
+            } else {
+                name += '.' + type.getExtension();
+            }
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+            return new ByteArrayInputStream(data);
+        } else if (inputStream instanceof TikaInputStream) {
+            TikaInputStream tin = (TikaInputStream) inputStream;
+
+            if (tin.getOpenContainer() != null && tin.getOpenContainer() 
instanceof DirectoryEntry) {
+                POIFSFileSystem fs = new POIFSFileSystem();
+                copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
+                ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
+                fs.writeFilesystem(bos2);
+                bos2.close();
+                return new ByteArrayInputStream(bos2.toByteArray());
+            }
+        }
+        return inputStream;
+    }
+
+    protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
+            throws IOException {
+        for (Entry entry : sourceDir) {
+            if (entry instanceof DirectoryEntry) {
+                // Need to recurse
+                DirectoryEntry newDir = 
destDir.createDirectory(entry.getName());
+                copy((DirectoryEntry) entry, newDir);
+            } else {
+                // Copy entry
+                try (InputStream contents = new 
DocumentInputStream((DocumentEntry) entry)) {
+                    destDir.createDocument(entry.getName(), contents);
+                }
+            }
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 4ecebfb..6ab763c 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -31,6 +31,7 @@ import java.util.Set;
 
 import org.apache.poi.extractor.POITextExtractor;
 import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -77,6 +78,9 @@ import org.xml.sax.helpers.AttributesImpl;
 public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
 
+    static {
+        ExtractorFactory.setAllThreadsPreferEventExtractors(true);
+    }
 
     static final String RELATION_AUDIO = 
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
     static final String RELATION_MEDIA = 
"http://schemas.microsoft.com/office/2007/relationships/media";
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
new file mode 100644
index 0000000..e59cba8
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
\ No newline at end of file
diff --git a/tika-server/tika-server-classic/pom.xml 
b/tika-server/tika-server-classic/pom.xml
index f263dde..68cae1e 100644
--- a/tika-server/tika-server-classic/pom.xml
+++ b/tika-server/tika-server-classic/pom.xml
@@ -101,7 +101,8 @@
                         </createDependencyReducedPom>
                         <artifactSet>
                             <excludes>
-                                
<exclude>org.apache.tika:tika-parsers-classic:jar:</exclude>
+                                
<exclude>org.apache.tika:tika-parsers-classic-package:jar:</exclude>
+                                
<exclude>org.apache.tika:tika-server-core:jar:</exclude>
                             </excludes>
                         </artifactSet>
                         <filters>
diff --git a/tika-server/tika-server-classic/src/TODO 
b/tika-server/tika-server-classic/src/TODO
deleted file mode 100644
index 6c605de..0000000
--- a/tika-server/tika-server-classic/src/TODO
+++ /dev/null
@@ -1,8 +0,0 @@
-this is needed by poi-ooxml
-    static {
-        ExtractorFactory.setAllThreadsPreferEventExtractors(true);
-    }
-
-figure out what to do with the xmp writer
-
-figure out what to do with the unpacker
diff --git 
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
 
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
new file mode 100644
index 0000000..bcf4869
--- /dev/null
+++ 
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
@@ -0,0 +1,49 @@
+package org.apache.tika.server.classic.resource;
+
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.resource.MetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.resource.TikaServerResource;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+import java.io.InputStream;
+
+public class XMPMetadataResource extends MetadataResource implements 
TikaServerResource {
+
+    @PUT
+    @Path("{field}")
+    @Produces({"application/rdf+xml"})
+    @Override
+    public Response getMetadataField(InputStream is, @Context HttpHeaders 
httpHeaders,
+                                     @Context UriInfo info, 
@PathParam("field") String field) throws Exception {
+        return super.getMetadataField(is, httpHeaders, info, field);
+    }
+
+    @POST
+    @Consumes("multipart/form-data")
+    @Produces({"application/rdf+xml"})
+    @Path("form")
+    public Response getMetadataFromMultipart(Attachment att, @Context UriInfo 
info) throws Exception {
+        return Response.ok(
+                parseMetadata(att.getObject(InputStream.class), new Metadata(),
+                        att.getHeaders(), info)).build();
+    }
+
+    @PUT
+    @Produces({"application/rdf+xml"})
+    public Response getMetadata(InputStream is, @Context HttpHeaders 
httpHeaders, @Context UriInfo info) throws Exception {
+        Metadata metadata = new Metadata();
+        return Response.ok(
+                parseMetadata(TikaResource.getInputStream(is, metadata, 
httpHeaders), metadata, httpHeaders.getRequestHeaders(), info)).build();
+    }
+}
diff --git 
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
 
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
index a98829b..88dcbf3 100644
--- 
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
+++ 
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
@@ -21,7 +21,6 @@ import javax.ws.rs.Produces;
 import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.MultivaluedMap;
-import javax.ws.rs.ext.MessageBodyWriter;
 import javax.ws.rs.ext.Provider;
 
 import java.io.IOException;
@@ -33,13 +32,14 @@ import java.lang.reflect.Type;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.writer.TikaServerWriter;
 import org.apache.tika.xmp.XMPMetadata;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
 @Provider
 @Produces("application/rdf+xml")
-public class XMPMessageBodyWriter implements MessageBodyWriter<Metadata> {
+public class XMPMessageBodyWriter implements TikaServerWriter<Metadata> {
 
     private static MediaType RDF_XML = 
MediaType.valueOf("application/rdf+xml");
 
diff --git 
a/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
 
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
new file mode 100644
index 0000000..0940048
--- /dev/null
+++ 
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.server.classic.resource.XMPMetadataResource
\ No newline at end of file
diff --git 
a/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
 
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
new file mode 100644
index 0000000..77f7ca1
--- /dev/null
+++ 
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
@@ -0,0 +1,15 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+org.apache.tika.server.classic.writer.XMPMessageBodyWriter
\ No newline at end of file
diff --git 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
index 31056a0..7319d24 100644
--- 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
+++ 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
@@ -40,6 +40,7 @@ import 
org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.server.classic.resource.XMPMetadataResource;
 import org.apache.tika.server.core.CXFTestBase;
 import org.apache.tika.server.core.resource.MetadataResource;
 import org.apache.tika.server.core.writer.CSVMessageBodyWriter;
@@ -55,9 +56,11 @@ public class MetadataResourceTest extends CXFTestBase {
 
     @Override
     protected void setUpResources(JAXRSServerFactoryBean sf) {
-        sf.setResourceClasses(MetadataResource.class);
+        sf.setResourceClasses(MetadataResource.class, 
XMPMetadataResource.class);
         sf.setResourceProvider(MetadataResource.class,
                 new SingletonResourceProvider(new MetadataResource()));
+        sf.setResourceProvider(XMPMetadataResource.class,
+                new SingletonResourceProvider(new XMPMetadataResource()));
     }
 
     @Override
@@ -163,7 +166,6 @@ public class MetadataResourceTest extends CXFTestBase {
     }
 
     @Test
-    @Ignore("TODO -- add back in xmp writer")
     public void testXMP() throws Exception {
         Response response = WebClient
                 .create(endPoint + META_PATH)
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
index d0b27df..805bf13 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
@@ -42,6 +43,7 @@ import 
org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
 import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
 import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
 import org.apache.tika.Tika;
+import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.digestutils.BouncyCastleDigester;
@@ -54,6 +56,7 @@ import org.apache.tika.server.core.resource.TikaDetectors;
 import org.apache.tika.server.core.resource.TikaMimeTypes;
 import org.apache.tika.server.core.resource.TikaParsers;
 import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.resource.TikaServerResource;
 import org.apache.tika.server.core.resource.TikaServerStatus;
 import org.apache.tika.server.core.resource.TikaVersion;
 import org.apache.tika.server.core.resource.TikaWelcome;
@@ -313,6 +316,7 @@ public class TikaServerCli {
             rCoreProviders.add(new SingletonResourceProvider(new 
TikaDetectors()));
             rCoreProviders.add(new SingletonResourceProvider(new 
TikaParsers()));
             rCoreProviders.add(new SingletonResourceProvider(new 
TikaVersion()));
+            rCoreProviders.addAll(loadResourceServices());
             if (line.hasOption("status")) {
                 rCoreProviders.add(new SingletonResourceProvider(new 
TikaServerStatus(serverStatus)));
             }
@@ -326,8 +330,8 @@ public class TikaServerCli {
             providers.add(new CSVMessageBodyWriter());
             providers.add(new MetadataListMessageBodyWriter());
             providers.add(new JSONMessageBodyWriter());
-            //providers.add(new XMPMessageBodyWriter());
             providers.add(new TextMessageBodyWriter());
+            providers.addAll(loadWriterServices());
             providers.add(new 
TikaServerParseExceptionMapper(returnStackTrace));
             if (line.hasOption("status")) {
                 providers.add(new JSONObjWriter());
@@ -357,6 +361,22 @@ public class TikaServerCli {
             LOG.info("Started Apache Tika server at {}", url);
     }
 
+    private static Collection<? extends ResourceProvider> 
loadResourceServices() {
+        List<TikaServerResource> resources = new 
ServiceLoader(TikaServerCli.class.getClassLoader())
+                .loadServiceProviders(TikaServerResource.class);
+        List<ResourceProvider> providers = new ArrayList<>();
+
+        for (TikaServerResource r : resources) {
+            providers.add(new SingletonResourceProvider(r));
+        }
+        return providers;
+    }
+
+    private static Collection<?> loadWriterServices() {
+        return new ServiceLoader(TikaServerCli.class.getClassLoader())
+                
.loadServiceProviders(org.apache.tika.server.core.writer.TikaServerWriter.class);
+    }
+
     private static void usage(Options options) {
         HelpFormatter helpFormatter = new HelpFormatter();
         helpFormatter.printHelp("tikaserver", options);
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
index 7587bf7..8668b16 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
@@ -123,8 +123,8 @@ public class MetadataResource {
         return Response.ok(metadata).build();
     }
 
-    private Metadata parseMetadata(InputStream is, Metadata metadata,
-                                   MultivaluedMap<String, String> httpHeaders, 
UriInfo info) throws IOException {
+    protected Metadata parseMetadata(InputStream is, Metadata metadata,
+                                     MultivaluedMap<String, String> 
httpHeaders, UriInfo info) throws IOException {
         final ParseContext context = new ParseContext();
         Parser parser = TikaResource.createParser();
         fillMetadata(parser, metadata, httpHeaders);
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
new file mode 100644
index 0000000..e4b97d3
--- /dev/null
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
@@ -0,0 +1,4 @@
+package org.apache.tika.server.core.resource;
+
+public interface TikaServerResource {
+}
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index 8b674c8..30860b1 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -47,7 +47,9 @@ import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.mutable.MutableInt;
 import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
 import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -163,6 +165,7 @@ public class UnpackerResource {
     private class MyEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {
         private final MutableInt count;
         private final Map<String, byte[]> zout;
+        private final EmbeddedStreamTranslator embeddedStreamTranslator = new 
DefaultEmbeddedStreamTranslator();
 
         MyEmbeddedDocumentExtractor(MutableInt count, Map<String, byte[]> 
zout) {
             this.count = count;
@@ -202,54 +205,20 @@ public class UnpackerResource {
                     LOG.warn("Unexpected MimeTypeException", e);
                 }
             }
-
-            if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
-                /*POIFSFileSystem poifs = new POIFSFileSystem(new 
ByteArrayInputStream(data));
-                OfficeParser.POIFSDocumentType type = 
OfficeParser.POIFSDocumentType.detectType(poifs);
-
-                if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
-                    try {
-                        Ole10Native ole = 
Ole10Native.createFromEmbeddedOleObject(poifs);
-                        if (ole.getDataSize() > 0) {
-                            String label = ole.getLabel();
-
-                            if (label.startsWith("ole-")) {
-                                label = Integer.toString(count.intValue()) + 
'-' + label;
-                            }
-
-                            name = label;
-
-                            data = ole.getDataBuffer();
-                        }
-                    } catch (Ole10NativeException ex) {
-                        LOG.warn("Skipping invalid part", ex);
-                    }
-                } else {
-                    name += '.' + type.getExtension();
-                }*/
+            try (InputStream is = new ByteArrayInputStream(data)) {
+                if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
+                    InputStream translated = 
embeddedStreamTranslator.translate(new ByteArrayInputStream(data), metadata);
+                    ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
+                    IOUtils.copy(translated, bos2);
+                    data = bos2.toByteArray();
+                }
             }
 
             final String finalName = getFinalName(name, zout);
 
             if (data.length > 0) {
                 zout.put(finalName, data);
-
                 count.increment();
-            } else {
-                /*
-                if (inputStream instanceof TikaInputStream) {
-                    TikaInputStream tin = (TikaInputStream) inputStream;
-
-                    if (tin.getOpenContainer() != null && 
tin.getOpenContainer() instanceof DirectoryEntry) {
-                        POIFSFileSystem fs = new POIFSFileSystem();
-                        copy((DirectoryEntry) tin.getOpenContainer(), 
fs.getRoot());
-                        ByteArrayOutputStream bos2 = new 
ByteArrayOutputStream();
-                        fs.writeFilesystem(bos2);
-                        bos2.close();
-
-                        zout.put(finalName, bos2.toByteArray());
-                    }
-                }*/
             }
         }
 
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
new file mode 100644
index 0000000..3fa35d0
--- /dev/null
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
@@ -0,0 +1,10 @@
+package org.apache.tika.server.core.writer;
+
+import javax.ws.rs.ext.MessageBodyWriter;
+
+/**
+ * Stub interface that allows SPI loading of writers from other modules
+ * without opening up service loading to every generic MessageBodyWriter.
+ */
+public interface TikaServerWriter<T> extends MessageBodyWriter<T> {
+}

Reply via email to