This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 2006dc5 TIKA-3352: Add json output for /tika endpoint in tika-server
2006dc5 is described below
commit 2006dc566c28f2655a8b7e625c49d6d1591e3d48
Author: tallison <[email protected]>
AuthorDate: Wed Apr 14 09:52:00 2021 -0400
TIKA-3352: Add json output for /tika endpoint in tika-server
---
CHANGES.txt | 2 +
.../java/org/apache/tika/server/TikaServerCli.java | 2 +-
.../server/resource/RecursiveMetadataResource.java | 4 +-
.../apache/tika/server/resource/TikaResource.java | 95 ++++++++++++++++++++-
.../java/org/apache/tika/server/CXFTestBase.java | 6 +-
.../tika/server/RecursiveMetadataResourceTest.java | 17 ++++
.../org/apache/tika/server/StackTraceTest.java | 17 +++-
.../server/TikaResourceMetadataFilterTest.java | 83 ++++++++++++++++++
.../tika/server/TikaResourceNoStackTest.java | 83 ++++++++++++++++++
.../org/apache/tika/server/TikaResourceTest.java | 98 ++++++++++++++++++++++
.../resources/configs/metadata-filter-include.xml | 30 +++++++
.../src/test/resources/mock/hello_world.xml | 26 ++++++
.../src/test/resources/mock/hello_world_long.xml | 30 +++++++
13 files changed, 486 insertions(+), 7 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 4c3464b..bc05fc7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.27 - ???
+ * Add json output for /tika endpoint in tika-server (TIKA-3352).
+
* Tika's PDFParser should use the underlying file if one is passed in
via a TikaInputStream (TIKA-3350)
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index 5b88a66..336ffb5 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -307,7 +307,7 @@ public class TikaServerCli {
} else {
serverStatus = new ServerStatus(serverId, 0, true);
}
- TikaResource.init(tika, digester, inputStreamFactory,
serverStatus);
+ TikaResource.init(tika, returnStackTrace, digester,
inputStreamFactory, serverStatus);
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
List<ResourceProvider> rCoreProviders = new ArrayList<>();
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 71e7180..ec37779 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -45,8 +45,8 @@ import org.slf4j.LoggerFactory;
@Path("/rmeta")
public class RecursiveMetadataResource {
- private static final String HANDLER_TYPE_PARAM = "handler";
- private static final BasicContentHandlerFactory.HANDLER_TYPE
DEFAULT_HANDLER_TYPE =
+ protected static final String HANDLER_TYPE_PARAM = "handler";
+ protected static final BasicContentHandlerFactory.HANDLER_TYPE
DEFAULT_HANDLER_TYPE =
BasicContentHandlerFactory.HANDLER_TYPE.XML;
private static final Logger LOG =
LoggerFactory.getLogger(RecursiveMetadataResource.class);
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 118d7c3..3425741 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -27,6 +27,7 @@ import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
@@ -40,12 +41,16 @@ import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RichTextContentHandler;
import org.apache.tika.server.InputStreamFactory;
import org.apache.tika.server.ServerStatus;
import org.apache.tika.server.TikaServerParseException;
+import org.apache.tika.utils.ExceptionUtils;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
@@ -56,6 +61,7 @@ import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
@@ -83,6 +89,8 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static
org.apache.tika.server.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE;
+import static
org.apache.tika.server.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM;
@Path("/tika")
public class TikaResource {
@@ -103,9 +111,12 @@ public class TikaResource {
private static DigestingParser.Digester digester = null;
private static InputStreamFactory inputStreamFactory = null;
private static ServerStatus SERVER_STATUS = null;
- public static void init(TikaConfig config, DigestingParser.Digester
digestr,
+ private static boolean INCLUDE_STACK_TRACE = false;
+ public static void init(TikaConfig config,
+ boolean includeStackTrace,
DigestingParser.Digester digestr,
InputStreamFactory iSF, ServerStatus serverStatus)
{
tikaConfig = config;
+ INCLUDE_STACK_TRACE = includeStackTrace;
digester = digestr;
inputStreamFactory = iSF;
SERVER_STATUS = serverStatus;
@@ -586,6 +597,88 @@ public class TikaResource {
metadata, httpHeaders.getRequestHeaders(), info, "xml");
}
+
+ @POST
+ @Consumes("multipart/form-data")
+ @Produces("application/json")
+ @Path("form{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
+ public Metadata getJsonFromMultipart(Attachment att,
+ @Context HttpHeaders httpHeaders,
+ @Context final UriInfo info,
+ @PathParam(HANDLER_TYPE_PARAM)
+ String handlerTypeName)
+ throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ parseToMetadata(getInputStream(att.getObject(InputStream.class),
metadata, httpHeaders),
+ metadata, preparePostHeaderMap(att, httpHeaders), info,
handlerTypeName);
+ TikaResource.getConfig().getMetadataFilter().filter(metadata);
+ return metadata;
+ }
+
+ @PUT
+ @Consumes("*/*")
+ @Produces("application/json")
+ @Path("{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
+ public Metadata getJson(final InputStream is, @Context
+ HttpHeaders httpHeaders,
+ @Context final UriInfo info,
@PathParam(HANDLER_TYPE_PARAM)
+ String handlerTypeName)
+ throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ parseToMetadata(getInputStream(is, metadata, httpHeaders), metadata,
+ httpHeaders.getRequestHeaders(), info, handlerTypeName);
+ TikaResource.getConfig().getMetadataFilter().filter(metadata);
+ return metadata;
+ }
+
+ private void parseToMetadata(InputStream inputStream,
+ Metadata metadata,
+ MultivaluedMap<String, String> httpHeaders,
+ UriInfo info, String handlerTypeName) throws
IOException {
+ final Parser parser = createParser();
+ final ParseContext context = new ParseContext();
+
+ fillMetadata(parser, metadata, context, httpHeaders);
+ fillParseContext(context, httpHeaders, parser);
+
+
+ logRequest(LOG, info, metadata);
+ int writeLimit = -1;
+ if (httpHeaders.containsKey("writeLimit")) {
+ writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
+ }
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.parseHandlerType(handlerTypeName,
DEFAULT_HANDLER_TYPE);
+ BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type,
writeLimit);
+ ContentHandler contentHandler = fact.getNewContentHandler();
+
+ try {
+ parse(parser, LOG, info.getPath(), inputStream, contentHandler,
metadata, context);
+ } catch (TikaServerParseException e) {
+ if (INCLUDE_STACK_TRACE) {
+ Throwable cause = e.getCause();
+ if (cause != null) {
+
metadata.add(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(cause));
+ } else {
+
metadata.add(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(e));
+ }
+ } else {
+ throw e;
+ }
+ } catch (OutOfMemoryError e) {
+ if (INCLUDE_STACK_TRACE) {
+
metadata.add(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(e));
+ } else {
+ throw e;
+ }
+ } finally {
+ metadata.add(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT,
contentHandler.toString());
+ }
+ }
+
private StreamingOutput produceOutput(final InputStream is, Metadata
metadata, final MultivaluedMap<String, String> httpHeaders,
final UriInfo info, final String
format) {
final Parser parser = createParser();
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index ada3ce7..cfbafd0 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -92,7 +92,7 @@ public abstract class CXFTestBase {
public void setUp() throws Exception {
this.tika = new TikaConfig(getTikaConfigInputStream());
- TikaResource.init(tika,
+ TikaResource.init(tika, isIncludeStackTrace(),
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory(), new ServerStatus("", 0,true));
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
@@ -121,6 +121,10 @@ public abstract class CXFTestBase {
server = sf.create();
}
+ protected boolean isIncludeStackTrace() {
+ return false;
+ }
+
protected InputStream getTikaConfigInputStream() {
return
getClass().getResourceAsStream("tika-config-for-server-tests.xml");
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 36ddf3c..d0c84c7 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -60,6 +60,7 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
private static final String SLASH = "/";
private static final String TEST_RECURSIVE_DOC =
"test_recursive_embedded.docx";
+ private static final String TEST_NULL_POINTER = "mock/null_pointer.xml";
@Override
protected void setUpResources(JAXRSServerFactoryBean sf) {
@@ -372,4 +373,20 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
}
+
+ @Test
+ public void testNPE() throws Exception {
+ Response response = WebClient.create(endPoint +
META_PATH).accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+
+ Reader reader = new InputStreamReader((InputStream)
response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ Metadata metadata = metadataList.get(0);
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("some content",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertContains("null pointer message",
+
metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION));
+
+ }
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java b/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
index 5115fd0..b3821b3 100644
--- a/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
@@ -78,15 +78,24 @@ public class StackTraceTest extends CXFTestBase {
sf.setProviders(providers);
}
+ @Override
+ protected boolean isIncludeStackTrace() {
+ return true;
+ }
+
@Test
public void testEncrypted() throws Exception {
for (String path : PATHS) {
if ("/rmeta".equals(path)) {
continue;
}
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
Response response = WebClient
.create(endPoint + path)
- .accept("*/*")
+ .accept(accept)
.header("Content-Disposition",
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
.put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
@@ -105,9 +114,13 @@ public class StackTraceTest extends CXFTestBase {
if ("/rmeta".equals(path)) {
continue;
}
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
Response response = WebClient
.create(endPoint + path)
- .accept("*/*")
+ .accept(accept)
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE,
response.getStatus());
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceMetadataFilterTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceMetadataFilterTest.java
new file mode 100644
index 0000000..4bf44ad
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceMetadataFilterTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.server.resource.TikaResource;
+import org.apache.tika.server.writer.JSONMessageBodyWriter;
+
+public class TikaResourceMetadataFilterTest extends CXFTestBase {
+
+ public static final String TEST_HELLO_WORLD = "mock/hello_world.xml";
+
+ private static final String TIKA_PATH = "/tika";
+
+ @Override
+ protected InputStream getTikaConfigInputStream() {
+ return
getClass().getResourceAsStream("/configs/metadata-filter-include.xml");
+ }
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ sf.setResourceClasses(TikaResource.class);
+ sf.setResourceProvider(TikaResource.class,
+ new SingletonResourceProvider(new TikaResource()));
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<Object>();
+ providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
+ sf.setProviders(providers);
+ }
+
+
+ @Test
+ public void testBasic() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ assertEquals(2, metadata.names().length);
+ assertNull(metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("hello world",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
+
+}
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceNoStackTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceNoStackTest.java
new file mode 100644
index 0000000..4f231b2
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceNoStackTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.Test;
+
+import org.apache.tika.server.resource.TikaResource;
+import org.apache.tika.server.writer.JSONMessageBodyWriter;
+
+public class TikaResourceNoStackTest extends CXFTestBase {
+
+ public static final String TEST_HELLO_WORLD_LONG =
"mock/hello_world_long.xml";
+ public static final String TEST_NULL_POINTER = "mock/null_pointer.xml";
+
+ private static final String TIKA_PATH = "/tika";
+
+ @Override
+ protected boolean isIncludeStackTrace() {
+ return false;
+ }
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ sf.setResourceClasses(TikaResource.class);
+ sf.setResourceProvider(TikaResource.class,
+ new SingletonResourceProvider(new TikaResource()));
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<Object>();
+ providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
+ sf.setProviders(providers);
+ }
+
+ @Test
+ public void testJsonNPE() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+ assertEquals(422, response.getStatus());
+ String content = getStringFromInputStream((InputStream)
response.getEntity());
+ assertEquals(0, content.length());
+ }
+
+ @Test
+ public void testJsonWriteLimit() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .header("writeLimit", "100")
+ .accept("application/json")
+
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ assertEquals(500, response.getStatus());
+ String content = getStringFromInputStream((InputStream)
response.getEntity());
+ assertEquals(0, content.length());
+ }
+
+}
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index ddfd316..8940a18 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -26,9 +26,16 @@ import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.server.resource.TikaResource;
+import org.apache.tika.server.writer.JSONMessageBodyWriter;
+
import org.junit.Test;
import javax.ws.rs.ProcessingException;
@@ -36,6 +43,7 @@ import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.io.FileNotFoundException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@@ -52,6 +60,11 @@ public class TikaResourceTest extends CXFTestBase {
public static final String TEST_PASSWORD_PROTECTED = "password.xls";
private static final String TEST_RECURSIVE_DOC =
"test_recursive_embedded.docx";
private static final String TEST_OOM = "mock/fake_oom.xml";
+ public static final String TEST_HELLO_WORLD = "mock/hello_world.xml";
+ public static final String TEST_HELLO_WORLD_LONG =
"mock/hello_world_long.xml";
+ public static final String TEST_NULL_POINTER = "mock/null_pointer.xml";
+
+
private static final String STREAM_CLOSED_FAULT = "java.io.IOException:
Stream Closed";
@@ -59,6 +72,12 @@ public class TikaResourceTest extends CXFTestBase {
private static final String TIKA_POST_PATH = "/tika/form";
private static final int UNPROCESSEABLE = 422;
+
+ @Override
+ protected boolean isIncludeStackTrace() {
+ return true;
+ }
+
@Override
protected void setUpResources(JAXRSServerFactoryBean sf) {
sf.setResourceClasses(TikaResource.class);
@@ -69,6 +88,7 @@ public class TikaResourceTest extends CXFTestBase {
@Override
protected void setUpProviders(JAXRSServerFactoryBean sf) {
List<Object> providers = new ArrayList<Object>();
+ providers.add(new JSONMessageBodyWriter());
providers.add(new TikaServerParseExceptionMapper(false));
sf.setProviders(providers);
}
@@ -604,4 +624,82 @@ public class TikaResourceTest extends CXFTestBase {
return new MultipartBody(att);
}
+ @Test
+ public void testJson() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("hello world",
+
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testJsonNPE() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("some content",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertContains("null pointer message",
+
metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION));
+ }
+
+ @Test
+ public void testJsonWriteLimit() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .header("writeLimit", "100")
+ .accept("application/json")
+
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertNotFound("dissolve",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
assertTrue(metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION).startsWith(
+
"org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"
+ ));
+ }
+
+ @Test
+ public void testJsonHandlerType() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .accept("application/json")
+
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ //default is xhtml
+ assertContains("<p>",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+
+ response = WebClient.create(endPoint + TIKA_PATH + "/text")
+ .accept("application/json")
+
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ assertNotFound("<p>",
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
+
}
diff --git a/tika-server/src/test/resources/configs/metadata-filter-include.xml b/tika-server/src/test/resources/configs/metadata-filter-include.xml
new file mode 100644
index 0000000..3a7a7c1
--- /dev/null
+++ b/tika-server/src/test/resources/configs/metadata-filter-include.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter
class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="list">
+ <string>X-TIKA:content</string>
+ <string>extended-properties:Application</string>
+ <string>Content-Type</string>
+ </param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-server/src/test/resources/mock/hello_world.xml b/tika-server/src/test/resources/mock/hello_world.xml
new file mode 100644
index 0000000..27cd62a
--- /dev/null
+++ b/tika-server/src/test/resources/mock/hello_world.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <metadata action="add" name="title">你好,世界</metadata>
+ <metadata action="add" name="my-key">parsers-value</metadata>
+ <write element="p">hello world</write>
+</mock>
\ No newline at end of file
diff --git a/tika-server/src/test/resources/mock/hello_world_long.xml b/tika-server/src/test/resources/mock/hello_world_long.xml
new file mode 100644
index 0000000..bf06ad2
--- /dev/null
+++ b/tika-server/src/test/resources/mock/hello_world_long.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <metadata action="add" name="title">你好,世界</metadata>
+ <metadata action="add" name="my-key">parsers-value</metadata>
+ <write element="p">Hello world...</write>
+ <write element="p">When in the Course of human events, it becomes
necessary for one people to dissolve the
+ political bands which have connected them with another, and to assume
among the powers of the earth, the
+ separate and equal station to which the Laws of Nature and of Nature’s
God entitle them, a decent respect
+ to the opinions of mankind requires that they should declare the
causes which impel them to the separation.</write>
+</mock>
\ No newline at end of file