This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4bab6a885b48e1d2b41b2fb305b761145aa92fbc Author: tallison <[email protected]> AuthorDate: Thu Mar 19 10:03:54 2020 -0400 TIKA-3073 -- allow gz compression of input and output streams in tika-server --- CHANGES.txt | 4 ++ .../java/org/apache/tika/server/TikaServerCli.java | 9 ++++ .../apache/tika/server/resource/TikaResource.java | 1 - .../java/org/apache/tika/server/CXFTestBase.java | 26 ++++++++++- .../tika/server/RecursiveMetadataResourceTest.java | 42 +++++++++++++++++- .../org/apache/tika/server/TikaResourceTest.java | 51 ++++++++++++++++++++++ 6 files changed, 130 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 4b1eddb..19d1985 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,10 @@ Release 2.0.0 - ??? Other changes +Release 1.25 - ??? + + * Allow gzip compression of input and output streams for tika-server (TIKA-3073). + Release 1.24 - 3/11/2019 * Add scripts to run tika-server as a service via Eric Pugh, diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java index a049373..10616cd 100644 --- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java +++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -37,6 +38,8 @@ import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; import org.apache.cxf.jaxrs.lifecycle.ResourceProvider; import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider; import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter; +import org.apache.cxf.transport.common.gzip.GZIPInInterceptor; +import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.DigestingParser; @@ -323,6 +326,12 @@ public class TikaServerCli { } sf.setProviders(providers); + //set compression interceptors + sf.setOutInterceptors( + Collections.singletonList(new GZIPOutInterceptor()) + ); + sf.setInInterceptors( + Collections.singletonList(new GZIPInInterceptor())); String url = "http://" + host + ":" + port + "/"; sf.setAddress(url); diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java index c5bfa8f..0275b7e 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java @@ -75,7 +75,6 @@ import java.io.Writer; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.Locale; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java index 32dd235..92c9d34 100644 --- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java +++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java @@ -25,11 +25,15 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.util.ArrayList; +import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.commons.codec.digest.DigestUtils; @@ -37,11 +41,16 @@ import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.commons.io.IOUtils; import org.apache.cxf.binding.BindingFactoryManager; import org.apache.cxf.endpoint.Server; +import org.apache.cxf.interceptor.Interceptor; import org.apache.cxf.jaxrs.JAXRSBindingFactory; import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; +import org.apache.cxf.message.Message; +import org.apache.cxf.transport.common.gzip.GZIPInInterceptor; +import org.apache.cxf.transport.common.gzip.GZIPOutInterceptor; import org.apache.tika.config.TikaConfig; import org.apache.tika.parser.utils.CommonsDigester; import org.apache.tika.server.resource.TikaResource; @@ -86,6 +95,13 @@ public abstract class CXFTestBase { new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"), new DefaultInputStreamFactory(), new ServerStatus(true)); JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean(); + //set compression interceptors + sf.setOutInterceptors( + Collections.singletonList(new GZIPOutInterceptor()) + ); + sf.setInInterceptors( + Collections.singletonList(new GZIPInInterceptor())); + setUpResources(sf); setUpProviders(sf); sf.setAddress(endPoint + "/"); @@ -101,7 +117,6 @@ public abstract class CXFTestBase { JAXRSBindingFactory.JAXRS_BINDING_ID, factory ); - server = sf.create(); } @@ -176,4 +191,13 @@ public abstract class CXFTestBase { return tmp; } + public static InputStream gzip(InputStream is) throws IOException { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + OutputStream gz = new GzipCompressorOutputStream(bos); + IOUtils.copy(is, gz); + gz.flush(); + gz.close(); + return new ByteArrayInputStream(bos.toByteArray()); + } + } diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java index ec7e389..b878f47 100644 --- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java +++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java @@ -25,21 +25,25 @@ import static org.junit.Assert.assertTrue; import javax.ws.rs.core.Response; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.List; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; import org.apache.cxf.jaxrs.client.WebClient; import org.apache.cxf.jaxrs.ext.multipart.Attachment; import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider; +import org.apache.tika.io.IOUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.serialization.JsonMetadataList; -import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.RecursiveParserWrapperHandler; import org.apache.tika.server.resource.RecursiveMetadataResource; @@ -73,6 +77,42 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { } @Test + public void testGZOut() throws Exception { + Response response = WebClient + .create(endPoint + META_PATH) + .accept("application/json") + .acceptEncoding("gzip") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + + Reader reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION)); + assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content")); + + assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5")); + } + + @Test + public void testGZIn() throws Exception { + + Response response = WebClient + .create(endPoint + META_PATH) + .accept("application/json") + .encoding("gzip") + .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC))); + + Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals(12, metadataList.size()); + assertEquals("Microsoft Office Word", metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION)); + assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content")); + + assertEquals("a38e6c7b38541af87148dee9634cb811", metadataList.get(10).get("X-TIKA:digest:MD5")); + + } + + @Test public void testSimpleWord() throws Exception { Response response = WebClient .create(endPoint + META_PATH) diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java index 3f65418..6b6fa23 100644 --- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java +++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java @@ -18,6 +18,8 @@ package org.apache.tika.server; import org.apache.commons.codec.binary.Base64; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.io.IOUtils; import org.apache.cxf.attachment.AttachmentUtil; import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; import org.apache.cxf.jaxrs.client.WebClient; @@ -30,12 +32,17 @@ import org.junit.Ignore; import org.junit.Test; import javax.ws.rs.ProcessingException; +import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.Response; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import java.util.Set; +import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -85,6 +92,50 @@ public class TikaResourceTest extends CXFTestBase { } @Test + public void testWordGzipIn() throws Exception { + Response response = WebClient.create(endPoint + TIKA_PATH) + .type("application/msword") + .accept("text/plain") + .encoding("gzip") + .put(gzip(ClassLoader.getSystemResourceAsStream(TEST_DOC))); + String responseMsg = getStringFromInputStream((InputStream) response + .getEntity()); + assertTrue(responseMsg.contains("test")); + } + + @Test + public void testLongGzipOut() throws Exception { + //if the output is long enough, jax-rs will compress it, otherwise it won't + //this output is long enough, and should be compressed + Response response = WebClient.create(endPoint + TIKA_PATH) + .accept("text/plain") + .acceptEncoding("gzip") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + assertTrue(response.getHeaders().containsKey(CONTENT_ENCODING)); + assertEquals("gzip", response.getHeaderString(CONTENT_ENCODING)); + String responseMsg = getStringFromInputStream( + new GzipCompressorInputStream((InputStream) response + .getEntity())); + assertTrue(responseMsg.contains("Course of human")); + } + + @Test + public void testShortGzipOut() throws Exception { + //if the output is long enough, jax-rs will compress it, otherwise it won't + //this output is short enough, and should not be compressed + Response response = WebClient.create(endPoint + TIKA_PATH) + .accept("text/plain") + .acceptEncoding("gzip") + .put(ClassLoader.getSystemResourceAsStream(TEST_DOC)); + assertFalse(response.getHeaders().containsKey(CONTENT_ENCODING)); + + String responseMsg = getStringFromInputStream( + (InputStream) response + .getEntity()); + assertTrue(responseMsg.contains("test")); + } + + @Test public void testTextMain() throws Exception { //boilerpipe Response response = WebClient.create(endPoint + TIKA_PATH + "/main")
