This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3180
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3180 by this push:
new cdd0bf1 TIKA-3180 -- modularize tika-server
cdd0bf1 is described below
commit cdd0bf135b9d725bb04f1a9736b9df8d0e57db21
Author: tallison <[email protected]>
AuthorDate: Wed Dec 16 11:39:33 2020 -0500
TIKA-3180 -- modularize tika-server
---
CHANGES.txt | 3 +
tika-app/pom.xml | 2 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 32 ++-----
.../extractor/DefaultEmbeddedStreamTranslator.java | 87 ++++++++++++++++++
.../tika/extractor/EmbeddedStreamTranslator.java | 38 ++++++++
.../microsoft/MSEmbeddedStreamTranslator.java | 101 +++++++++++++++++++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 4 +
....apache.tika.extractor.EmbeddedStreamTranslator | 15 +++
tika-server/tika-server-classic/pom.xml | 3 +-
tika-server/tika-server-classic/src/TODO | 8 --
.../classic/resource/XMPMetadataResource.java | 49 ++++++++++
.../classic/writer/XMPMessageBodyWriter.java | 4 +-
...he.tika.server.core.resource.TikaServerResource | 15 +++
...apache.tika.server.core.writer.TikaServerWriter | 15 +++
.../tika/server/classic/MetadataResourceTest.java | 6 +-
.../org/apache/tika/server/core/TikaServerCli.java | 22 ++++-
.../server/core/resource/MetadataResource.java | 4 +-
.../server/core/resource/TikaServerResource.java | 4 +
.../server/core/resource/UnpackerResource.java | 51 ++---------
.../tika/server/core/writer/TikaServerWriter.java | 10 ++
20 files changed, 389 insertions(+), 84 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 49a7ccc..0d61915 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,6 +11,9 @@ Release 2.0.0 - ???
* General code cleanup (PeterAlfredLee)
+ * tika-server's /metadata endpoint requires tika-server-classic to write XMP/rdf output.
+ This output is not available in tika-server-core.
+
Other changes
Release 1.26 - ???
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index 461c45d..14525ef 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -112,7 +112,7 @@
</createDependencyReducedPom>
<artifactSet>
<excludes>
- <exclude>org.apache.tika:tika-parsers:jar:</exclude>
+
<exclude>org.apache.tika:tika-parsers-classic-package:jar:</exclude>
</excludes>
</artifactSet>
<filters>
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index e2557e4..b246207 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -74,7 +74,9 @@ import org.apache.tika.config.TikaConfigSerializer;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.io.TikaInputStream;
@@ -1041,6 +1043,7 @@ public class TikaCLI {
private int count = 0;
private final TikaConfig config = TikaConfig.getDefaultConfig();
+ private final EmbeddedStreamTranslator embeddedStreamTranslator = new
DefaultEmbeddedStreamTranslator();
public boolean shouldParseEmbedded(Metadata metadata) {
return true;
@@ -1070,15 +1073,9 @@ public class TikaCLI {
System.out.println("Extracting '"+name+"' ("+contentType+") to " +
outputFile);
try (FileOutputStream os = new FileOutputStream(outputFile)) {
- if (inputStream instanceof TikaInputStream) {
- TikaInputStream tin = (TikaInputStream) inputStream;
-
- if (tin.getOpenContainer() != null &&
tin.getOpenContainer() instanceof DirectoryEntry) {
- POIFSFileSystem fs = new POIFSFileSystem();
- copy((DirectoryEntry) tin.getOpenContainer(),
fs.getRoot());
- fs.writeFilesystem(os);
- } else {
- IOUtils.copy(inputStream, os);
+ if (embeddedStreamTranslator.shouldTranslate(inputStream,
metadata)) {
+ try (InputStream translated =
embeddedStreamTranslator.translate(inputStream, metadata)) {
+ IOUtils.copy(translated, os);
}
} else {
IOUtils.copy(inputStream, os);
@@ -1148,23 +1145,6 @@ public class TikaCLI {
return ".bin";
}
-
- protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
- throws IOException {
- for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
- if (entry instanceof DirectoryEntry) {
- // Need to recurse
- DirectoryEntry newDir =
destDir.createDirectory(entry.getName());
- copy((DirectoryEntry) entry, newDir);
- } else {
- // Copy entry
- try (InputStream contents =
- new DocumentInputStream((DocumentEntry) entry)) {
- destDir.createDocument(entry.getName(), contents);
- }
- }
- }
- }
}
private class NoDocumentMetHandler extends DefaultHandler {
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
new file mode 100644
index 0000000..86af6c1
--- /dev/null
+++
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * Loads EmbeddedStreamTranslators via service loading. Tries to run each
+ * in turn and returns the first non-null value. If no translation has occurred,
+ * this returns the original InputStream. If a translation has occurred, the
+ * translator will consume the InputStream but not close it.
+ */
+public class DefaultEmbeddedStreamTranslator implements
EmbeddedStreamTranslator {
+
+ final List<EmbeddedStreamTranslator> translators;
+
+ private static List<EmbeddedStreamTranslator>
getDefaultFilters(ServiceLoader loader) {
+ List<EmbeddedStreamTranslator> embeddedStreamTranslators
+ = loader.loadServiceProviders(EmbeddedStreamTranslator.class);
+ ServiceLoaderUtils.sortLoadedClasses(embeddedStreamTranslators);
+ return embeddedStreamTranslators;
+ }
+
+ public DefaultEmbeddedStreamTranslator() {
+ this(getDefaultFilters(new ServiceLoader()));
+ }
+
+ private DefaultEmbeddedStreamTranslator(List<EmbeddedStreamTranslator>
translators) {
+ this.translators = translators;
+ }
+
+ /**
+ * This should sniff the stream to determine if it needs to be translated.
+ * The translator is responsible for resetting the stream if any bytes have been read.
+ * @param inputStream
+ * @param metadata
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public boolean shouldTranslate(InputStream inputStream, Metadata metadata)
throws IOException {
+ for (EmbeddedStreamTranslator translator : translators) {
+ if (translator.shouldTranslate(inputStream, metadata)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * This will consume the InputStream and return a new stream of translated bytes.
+ * @param inputStream
+ * @param metadata
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public InputStream translate(InputStream inputStream, Metadata metadata)
throws IOException {
+ for (EmbeddedStreamTranslator translator : translators) {
+ InputStream translated = translator.translate(inputStream,
metadata);
+ if (translated != null) {
+ return translated;
+ }
+ }
+ return inputStream;
+ }
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
new file mode 100644
index 0000000..c6387fe
--- /dev/null
+++
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Interface for different filtering of embedded streams.
+ * Specifically, unravel OLE streams in tika-server unpack,
+ * and/or handle open containers in TikaInputStream
+ *
+ * @since Apache Tika 2.0.0
+ */
+public interface EmbeddedStreamTranslator {
+
+ boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws
IOException;
+
+ InputStream translate(InputStream inputStream,
+ Metadata metadata) throws IOException;
+
+}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
new file mode 100644
index 0000000..f8d157d
--- /dev/null
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -0,0 +1,101 @@
+package org.apache.tika.extractor.microsoft;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class);
+
+ @Override
+ public boolean shouldTranslate(InputStream inputStream, Metadata metadata)
throws IOException {
+ String contentType =
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+ if
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType))
{
+ return true;
+ } else if (inputStream instanceof TikaInputStream) {
+ TikaInputStream tin = (TikaInputStream) inputStream;
+ if (tin.getOpenContainer() != null && tin.getOpenContainer()
instanceof DirectoryEntry) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public InputStream translate(InputStream inputStream, Metadata metadata)
throws IOException {
+ String contentType =
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+ if
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType))
{
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(inputStream, bos);
+ byte[] data = bos.toByteArray();
+ POIFSFileSystem poifs = new POIFSFileSystem(new
ByteArrayInputStream(data));
+ OfficeParser.POIFSDocumentType type =
OfficeParser.POIFSDocumentType.detectType(poifs);
+ String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+
+ if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
+ try {
+ Ole10Native ole =
Ole10Native.createFromEmbeddedOleObject(poifs);
+ if (ole.getDataSize() > 0) {
+ String label = ole.getLabel();
+
+ name = label;
+
+ data = ole.getDataBuffer();
+ }
+ } catch (Ole10NativeException ex) {
+ LOG.warn("Skipping invalid part", ex);
+ }
+ } else {
+ name += '.' + type.getExtension();
+ }
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ return new ByteArrayInputStream(data);
+ } else if (inputStream instanceof TikaInputStream) {
+ TikaInputStream tin = (TikaInputStream) inputStream;
+
+ if (tin.getOpenContainer() != null && tin.getOpenContainer()
instanceof DirectoryEntry) {
+ POIFSFileSystem fs = new POIFSFileSystem();
+ copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
+ ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
+ fs.writeFilesystem(bos2);
+ bos2.close();
+ return new ByteArrayInputStream(bos2.toByteArray());
+ }
+ }
+ return inputStream;
+ }
+
+ protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
+ throws IOException {
+ for (Entry entry : sourceDir) {
+ if (entry instanceof DirectoryEntry) {
+ // Need to recurse
+ DirectoryEntry newDir =
destDir.createDirectory(entry.getName());
+ copy((DirectoryEntry) entry, newDir);
+ } else {
+ // Copy entry
+ try (InputStream contents = new
DocumentInputStream((DocumentEntry) entry)) {
+ destDir.createDocument(entry.getName(), contents);
+ }
+ }
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 4ecebfb..6ab763c 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -31,6 +31,7 @@ import java.util.Set;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.ooxml.POIXMLDocument;
+import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -77,6 +78,9 @@ import org.xml.sax.helpers.AttributesImpl;
public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+ static {
+ ExtractorFactory.setAllThreadsPreferEventExtractors(true);
+ }
static final String RELATION_AUDIO =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
static final String RELATION_MEDIA =
"http://schemas.microsoft.com/office/2007/relationships/media";
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
new file mode 100644
index 0000000..e59cba8
--- /dev/null
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
\ No newline at end of file
diff --git a/tika-server/tika-server-classic/pom.xml
b/tika-server/tika-server-classic/pom.xml
index f263dde..68cae1e 100644
--- a/tika-server/tika-server-classic/pom.xml
+++ b/tika-server/tika-server-classic/pom.xml
@@ -101,7 +101,8 @@
</createDependencyReducedPom>
<artifactSet>
<excludes>
-
<exclude>org.apache.tika:tika-parsers-classic:jar:</exclude>
+
<exclude>org.apache.tika:tika-parsers-classic-package:jar:</exclude>
+
<exclude>org.apache.tika:tika-server-core:jar:</exclude>
</excludes>
</artifactSet>
<filters>
diff --git a/tika-server/tika-server-classic/src/TODO
b/tika-server/tika-server-classic/src/TODO
deleted file mode 100644
index 6c605de..0000000
--- a/tika-server/tika-server-classic/src/TODO
+++ /dev/null
@@ -1,8 +0,0 @@
-this is needed by poi-ooxml
- static {
- ExtractorFactory.setAllThreadsPreferEventExtractors(true);
- }
-
-figure out what to do with the xmp writer
-
-figure out what to do with the unpacker
diff --git
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
new file mode 100644
index 0000000..bcf4869
--- /dev/null
+++
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/resource/XMPMetadataResource.java
@@ -0,0 +1,49 @@
+package org.apache.tika.server.classic.resource;
+
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.resource.MetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.resource.TikaServerResource;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+import java.io.InputStream;
+
+public class XMPMetadataResource extends MetadataResource implements
TikaServerResource {
+
+ @PUT
+ @Path("{field}")
+ @Produces({"application/rdf+xml"})
+ @Override
+ public Response getMetadataField(InputStream is, @Context HttpHeaders
httpHeaders,
+ @Context UriInfo info,
@PathParam("field") String field) throws Exception {
+ return super.getMetadataField(is, httpHeaders, info, field);
+ }
+
+ @POST
+ @Consumes("multipart/form-data")
+ @Produces({"application/rdf+xml"})
+ @Path("form")
+ public Response getMetadataFromMultipart(Attachment att, @Context UriInfo
info) throws Exception {
+ return Response.ok(
+ parseMetadata(att.getObject(InputStream.class), new Metadata(),
+ att.getHeaders(), info)).build();
+ }
+
+ @PUT
+ @Produces({"application/rdf+xml"})
+ public Response getMetadata(InputStream is, @Context HttpHeaders
httpHeaders, @Context UriInfo info) throws Exception {
+ Metadata metadata = new Metadata();
+ return Response.ok(
+ parseMetadata(TikaResource.getInputStream(is, metadata,
httpHeaders), metadata, httpHeaders.getRequestHeaders(), info)).build();
+ }
+}
diff --git
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
index a98829b..88dcbf3 100644
---
a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
+++
b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/writer/XMPMessageBodyWriter.java
@@ -21,7 +21,6 @@ import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.MultivaluedMap;
-import javax.ws.rs.ext.MessageBodyWriter;
import javax.ws.rs.ext.Provider;
import java.io.IOException;
@@ -33,13 +32,14 @@ import java.lang.reflect.Type;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.writer.TikaServerWriter;
import org.apache.tika.xmp.XMPMetadata;
import static java.nio.charset.StandardCharsets.UTF_8;
@Provider
@Produces("application/rdf+xml")
-public class XMPMessageBodyWriter implements MessageBodyWriter<Metadata> {
+public class XMPMessageBodyWriter implements TikaServerWriter<Metadata> {
private static MediaType RDF_XML =
MediaType.valueOf("application/rdf+xml");
diff --git
a/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
new file mode 100644
index 0000000..0940048
--- /dev/null
+++
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.resource.TikaServerResource
@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.server.classic.resource.XMPMetadataResource
\ No newline at end of file
diff --git
a/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
new file mode 100644
index 0000000..77f7ca1
--- /dev/null
+++
b/tika-server/tika-server-classic/src/main/resources/META-INF/services/org.apache.tika.server.core.writer.TikaServerWriter
@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+org.apache.tika.server.classic.writer.XMPMessageBodyWriter
\ No newline at end of file
diff --git
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
index 31056a0..7319d24 100644
---
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
+++
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/MetadataResourceTest.java
@@ -40,6 +40,7 @@ import
org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.server.classic.resource.XMPMetadataResource;
import org.apache.tika.server.core.CXFTestBase;
import org.apache.tika.server.core.resource.MetadataResource;
import org.apache.tika.server.core.writer.CSVMessageBodyWriter;
@@ -55,9 +56,11 @@ public class MetadataResourceTest extends CXFTestBase {
@Override
protected void setUpResources(JAXRSServerFactoryBean sf) {
- sf.setResourceClasses(MetadataResource.class);
+ sf.setResourceClasses(MetadataResource.class,
XMPMetadataResource.class);
sf.setResourceProvider(MetadataResource.class,
new SingletonResourceProvider(new MetadataResource()));
+ sf.setResourceProvider(XMPMetadataResource.class,
+ new SingletonResourceProvider(new XMPMetadataResource()));
}
@Override
@@ -163,7 +166,6 @@ public class MetadataResourceTest extends CXFTestBase {
}
@Test
- @Ignore("TODO -- add back in xmp writer")
public void testXMP() throws Exception {
Response response = WebClient
.create(endPoint + META_PATH)
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
index d0b27df..805bf13 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerCli.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
@@ -42,6 +43,7 @@ import
org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
import org.apache.cxf.transport.common.gzip.GZIPInInterceptor;
import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor;
import org.apache.tika.Tika;
+import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.digestutils.BouncyCastleDigester;
@@ -54,6 +56,7 @@ import org.apache.tika.server.core.resource.TikaDetectors;
import org.apache.tika.server.core.resource.TikaMimeTypes;
import org.apache.tika.server.core.resource.TikaParsers;
import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.resource.TikaServerResource;
import org.apache.tika.server.core.resource.TikaServerStatus;
import org.apache.tika.server.core.resource.TikaVersion;
import org.apache.tika.server.core.resource.TikaWelcome;
@@ -313,6 +316,7 @@ public class TikaServerCli {
rCoreProviders.add(new SingletonResourceProvider(new
TikaDetectors()));
rCoreProviders.add(new SingletonResourceProvider(new
TikaParsers()));
rCoreProviders.add(new SingletonResourceProvider(new
TikaVersion()));
+ rCoreProviders.addAll(loadResourceServices());
if (line.hasOption("status")) {
rCoreProviders.add(new SingletonResourceProvider(new
TikaServerStatus(serverStatus)));
}
@@ -326,8 +330,8 @@ public class TikaServerCli {
providers.add(new CSVMessageBodyWriter());
providers.add(new MetadataListMessageBodyWriter());
providers.add(new JSONMessageBodyWriter());
- //providers.add(new XMPMessageBodyWriter());
providers.add(new TextMessageBodyWriter());
+ providers.addAll(loadWriterServices());
providers.add(new
TikaServerParseExceptionMapper(returnStackTrace));
if (line.hasOption("status")) {
providers.add(new JSONObjWriter());
@@ -357,6 +361,22 @@ public class TikaServerCli {
LOG.info("Started Apache Tika server at {}", url);
}
+ private static Collection<? extends ResourceProvider>
loadResourceServices() {
+ List<TikaServerResource> resources = new
ServiceLoader(TikaServerCli.class.getClassLoader())
+ .loadServiceProviders(TikaServerResource.class);
+ List<ResourceProvider> providers = new ArrayList<>();
+
+ for (TikaServerResource r : resources) {
+ providers.add(new SingletonResourceProvider(r));
+ }
+ return providers;
+ }
+
+ private static Collection<?> loadWriterServices() {
+ return new ServiceLoader(TikaServerCli.class.getClassLoader())
+
.loadServiceProviders(org.apache.tika.server.core.writer.TikaServerWriter.class);
+ }
+
private static void usage(Options options) {
HelpFormatter helpFormatter = new HelpFormatter();
helpFormatter.printHelp("tikaserver", options);
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
index 7587bf7..8668b16 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/MetadataResource.java
@@ -123,8 +123,8 @@ public class MetadataResource {
return Response.ok(metadata).build();
}
- private Metadata parseMetadata(InputStream is, Metadata metadata,
- MultivaluedMap<String, String> httpHeaders,
UriInfo info) throws IOException {
+ protected Metadata parseMetadata(InputStream is, Metadata metadata,
+ MultivaluedMap<String, String>
httpHeaders, UriInfo info) throws IOException {
final ParseContext context = new ParseContext();
Parser parser = TikaResource.createParser();
fillMetadata(parser, metadata, httpHeaders);
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
new file mode 100644
index 0000000..e4b97d3
--- /dev/null
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaServerResource.java
@@ -0,0 +1,4 @@
+package org.apache.tika.server.core.resource;
+
+public interface TikaServerResource {
+}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index 8b674c8..30860b1 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -47,7 +47,9 @@ import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -163,6 +165,7 @@ public class UnpackerResource {
private class MyEmbeddedDocumentExtractor implements
EmbeddedDocumentExtractor {
private final MutableInt count;
private final Map<String, byte[]> zout;
+ private final EmbeddedStreamTranslator embeddedStreamTranslator = new
DefaultEmbeddedStreamTranslator();
MyEmbeddedDocumentExtractor(MutableInt count, Map<String, byte[]>
zout) {
this.count = count;
@@ -202,54 +205,20 @@ public class UnpackerResource {
LOG.warn("Unexpected MimeTypeException", e);
}
}
-
- if
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType))
{
- /*POIFSFileSystem poifs = new POIFSFileSystem(new
ByteArrayInputStream(data));
- OfficeParser.POIFSDocumentType type =
OfficeParser.POIFSDocumentType.detectType(poifs);
-
- if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
- try {
- Ole10Native ole =
Ole10Native.createFromEmbeddedOleObject(poifs);
- if (ole.getDataSize() > 0) {
- String label = ole.getLabel();
-
- if (label.startsWith("ole-")) {
- label = Integer.toString(count.intValue()) +
'-' + label;
- }
-
- name = label;
-
- data = ole.getDataBuffer();
- }
- } catch (Ole10NativeException ex) {
- LOG.warn("Skipping invalid part", ex);
- }
- } else {
- name += '.' + type.getExtension();
- }*/
+ try (InputStream is = new ByteArrayInputStream(data)) {
+ if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
+ InputStream translated =
embeddedStreamTranslator.translate(new ByteArrayInputStream(data), metadata);
+ ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
+ IOUtils.copy(translated, bos2);
+ data = bos2.toByteArray();
+ }
}
final String finalName = getFinalName(name, zout);
if (data.length > 0) {
zout.put(finalName, data);
-
count.increment();
- } else {
- /*
- if (inputStream instanceof TikaInputStream) {
- TikaInputStream tin = (TikaInputStream) inputStream;
-
- if (tin.getOpenContainer() != null &&
tin.getOpenContainer() instanceof DirectoryEntry) {
- POIFSFileSystem fs = new POIFSFileSystem();
- copy((DirectoryEntry) tin.getOpenContainer(),
fs.getRoot());
- ByteArrayOutputStream bos2 = new
ByteArrayOutputStream();
- fs.writeFilesystem(bos2);
- bos2.close();
-
- zout.put(finalName, bos2.toByteArray());
- }
- }*/
}
}
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
new file mode 100644
index 0000000..3fa35d0
--- /dev/null
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/writer/TikaServerWriter.java
@@ -0,0 +1,10 @@
+package org.apache.tika.server.core.writer;
+
+import javax.ws.rs.ext.MessageBodyWriter;
+
+/**
+ * stub interface to allow for SPI loading from other modules
+ * without opening up service loading to any generic MessageBodyWriter
+ */
+public interface TikaServerWriter<T> extends MessageBodyWriter<T> {
+}