This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3226 by this push:
new 69b8fbb TIKA-3226 -- WIP do not merge -- add emitter test, other cleanup
69b8fbb is described below
commit 69b8fbb16a453ecd968f58b58d117876c5b19d48
Author: tballison <[email protected]>
AuthorDate: Wed Jan 20 11:00:44 2021 -0500
TIKA-3226 -- WIP do not merge -- add emitter test, other cleanup
---
.../org/apache/tika/fetcher/FileSystemFetcher.java | 2 +-
.../apache/tika/emitter/fs/FileSystemEmitter.java | 7 +-
.../apache/tika/server/classic/FetcherTest.java | 11 +-
tika-server/tika-server-core/pom.xml | 6 +
.../tika/server/core/resource/EmitterResource.java | 95 ++++++++++---
.../tika/server/core/resource/TikaResource.java | 4 +-
.../apache/tika/server/core/StackTraceOffTest.java | 5 +-
.../apache/tika/server/core/StackTraceTest.java | 3 +-
.../apache/tika/server/core/TikaEmitterTest.java | 150 +++++++++++++++++++++
9 files changed, 248 insertions(+), 35 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 83a6677..b93f202 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -33,7 +33,7 @@ import java.util.Set;
public class FileSystemFetcher implements Fetcher {
- private static String PREFIX = "file";
+ private static String PREFIX = "fs";
private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
private Path basePath = null;
@Override
diff --git a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
index a99c013..54219a8 100644
--- a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
+++ b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
@@ -39,6 +39,9 @@ public class FileSystemEmitter implements Emitter {
String relPath = metadataList.get(0)
.get(TikaCoreProperties.SOURCE_PATH);
+ if (fileExtension != null && fileExtension.length() > 0) {
+ relPath += "." + fileExtension;
+ }
if (basePath != null) {
output = basePath.resolve(relPath);
} else {
@@ -54,8 +57,8 @@ public class FileSystemEmitter implements Emitter {
}
@Field
- public void setBasePath(Path basePath) {
- this.basePath = basePath;
+ public void setBasePath(String basePath) {
+ this.basePath = Paths.get(basePath);
}
/**
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
index 7ff931e..94effe3 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
@@ -20,27 +20,19 @@ package org.apache.tika.server.classic;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.server.core.CXFTestBase;
-import org.apache.tika.server.core.DefaultInputStreamFactory;
import org.apache.tika.server.core.FetcherStreamFactory;
import org.apache.tika.server.core.InputStreamFactory;
import org.apache.tika.server.core.resource.RecursiveMetadataResource;
-import org.apache.tika.server.core.resource.TikaResource;
import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+import org.junit.Ignore;
import org.junit.Test;
-import javax.ws.rs.core.MultivaluedHashMap;
-import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -54,6 +46,7 @@ import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+@Ignore("turn into actual unit tests")
public class FetcherTest extends CXFTestBase {
private static final String META_PATH = "/rmeta";
diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml
index 11e6664..740debe 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -185,6 +185,12 @@
</dependency>
<dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-emitter-fs</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
index 3bb0b0e..b706420 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
@@ -17,30 +17,24 @@
package org.apache.tika.server.core.resource;
-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.tika.emitter.Emitter;
import org.apache.tika.emitter.TikaEmitterException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.Fetcher;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.server.core.MetadataList;
import org.apache.tika.utils.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import javax.ws.rs.Consumes;
+import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
-import javax.ws.rs.core.MultivaluedMap;
-import javax.ws.rs.core.Response;
import javax.ws.rs.core.UriInfo;
import java.io.IOException;
import java.io.InputStream;
@@ -48,17 +42,45 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import static org.apache.tika.server.core.resource.TikaResource.fillMetadata;
-import static org.apache.tika.server.core.resource.TikaResource.fillParseContext;
-
@Path("/emit")
public class EmitterResource {
private static final String EMITTER_PARAM = "emitter";
+ private static final String FETCH_STRING = "fetchString";
private static final Logger LOG =
LoggerFactory.getLogger(EmitterResource.class);
/**
+ *
+ * @param httpHeaders
+ * @param info
+ * @param emitterName
+ * @param fetchString specify the fetch string in the url's query section
+ * @return
+ * @throws Exception
+ */
+ @GET
+ @Produces("application/json")
+ @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
+ public Map<String, String> getMetadata(InputStream is, @Context HttpHeaders httpHeaders,
+ @Context UriInfo info,
+ @PathParam(EMITTER_PARAM) String emitterName,
+ @QueryParam(FETCH_STRING) String fetchString) throws Exception {
+
+ Metadata metadata = new Metadata();
+ Fetcher fetcher = TikaResource.getConfig().getFetcher();
+ List<Metadata> metadataList;
+ try (InputStream fetchedIs = fetcher.fetch(fetchString, metadata)) {
+ metadataList =
+ RecursiveMetadataResource.parseMetadata(fetchedIs,
+ metadata,
+ httpHeaders.getRequestHeaders(), info, "text");
+ }
+ return emit(emitterName, metadataList);
+ }
+
+
+ /**
* Returns an InputStream that can be deserialized as a list of
* {@link Metadata} objects.
* The first in the list represents the main document, and the
@@ -76,26 +98,63 @@ public class EmitterResource {
* @return InputStream that can be deserialized as a list of {@link Metadata} objects
* @throws Exception
*/
-
@PUT
@Produces("application/json")
@Path("{" + EMITTER_PARAM + " : (\\w+)?}")
+ public Map<String, String> getMetadataFromInputStream(InputStream is,
+ @Context HttpHeaders httpHeaders,
+ @Context UriInfo info,
+ @PathParam(EMITTER_PARAM) String emitterName
+ ) throws Exception {
+
+ Metadata metadata = new Metadata();
+ List<Metadata> metadataList =
+ RecursiveMetadataResource.parseMetadata(is,
+ metadata,
+ httpHeaders.getRequestHeaders(), info, "text");
+ return emit(emitterName, metadataList);
+ }
+
+ /**
+ * Returns an InputStream that can be deserialized as a list of
+ * {@link Metadata} objects.
+ * The first in the list represents the main document, and the
+ * rest represent metadata for the embedded objects. This works
+ * recursively through all descendants of the main document, not
+ * just the immediate children.
+ * <p>
+ * The extracted text content is stored with the key
+ * {@link org.apache.tika.sax.AbstractRecursiveParserWrapperHandler#TIKA_CONTENT}
+ * <p>
+ * Must specify an emitter in the path, e.g. /emit/solr
+ * @param info uri info
+ * @param emitterName which emitter to use; emitters must be configured in
+ * the TikaConfig file.
+ * @return InputStream that can be deserialized as a list of {@link Metadata} objects
+ * @throws Exception
+ */
+ @POST
+ @Produces("application/json")
+ @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
public Map<String, String> getMetadata(InputStream is,
@Context HttpHeaders httpHeaders,
@Context UriInfo info,
@PathParam(EMITTER_PARAM) String emitterName
) throws Exception {
- String status = "ok";
- String exceptionMsg = "";
Metadata metadata = new Metadata();
List<Metadata> metadataList =
RecursiveMetadataResource.parseMetadata(TikaResource.getInputStream(is,
metadata,
httpHeaders),
- metadata,
- httpHeaders.getRequestHeaders(), info, "text");
+ metadata,
+ httpHeaders.getRequestHeaders(), info, "text");
+ return emit(emitterName, metadataList);
+ }
+ private Map<String, String> emit(String emitterName, List<Metadata> metadataList) throws TikaException {
Emitter emitter = TikaResource.getConfig().getEmitter();
+ String status = "ok";
+ String exceptionMsg = "";
try {
emitter.emit(emitterName, metadataList);
} catch (IOException|TikaEmitterException e) {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 896f50d..46cf093 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -279,7 +279,9 @@ public class TikaResource {
}
String contentTypeHeader =
httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
- javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null
+ javax.ws.rs.core.MediaType mediaType =
+ (contentTypeHeader == null || "*/*".equals(contentTypeHeader))
+ ? null
: javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
if (mediaType != null && "xml".equals(mediaType.getSubtype())) {
mediaType = null;
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
index 2c418f0..70a6b78 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
@@ -1,6 +1,4 @@
-package org.apache.tika.server.core;
-
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,6 +14,7 @@ package org.apache.tika.server.core;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.tika.server.core;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
index 90c9a6d..0df6a78 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
@@ -1,4 +1,3 @@
-package org.apache.tika.server.core;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -15,6 +14,8 @@ package org.apache.tika.server.core;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.tika.server.core;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java
new file mode 100644
index 0000000..ec26812
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.server.core.resource.EmitterResource;
+import org.apache.tika.server.core.writer.JSONObjWriter;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import javax.ws.rs.core.Response;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * This offers basic integration tests with fetchers and emitters.
+ * We use file system fetchers and emitters.
+ */
+public class TikaEmitterTest extends CXFTestBase {
+
+ private static final String EMITTER_PATH = "/emit/fs";
+ private static Path TMP_DIR;
+ private static Path TMP_OUTPUT_DIR;
+ private static Path TMP_OUTPUT_FILE;
+ private static String TIKA_CONFIG_XML;
+
+ @BeforeClass
+ public static void setUpBeforeClass() throws Exception {
+ TMP_DIR = Files.createTempDirectory("tika-emitter-test-");
+ Path inputDir = TMP_DIR.resolve("input");
+ TMP_OUTPUT_DIR = TMP_DIR.resolve("output");
+ TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve("hello_world.xml.json");
+ Files.createDirectories(inputDir);
+ Files.createDirectories(TMP_OUTPUT_DIR);
+ Files.copy(TikaEmitterTest.class.getResourceAsStream("/test-documents/mock/hello_world.xml"),
+ inputDir.resolve("hello_world.xml"));
+ TIKA_CONFIG_XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"+
+ "<properties>"+
+ "<fetchers>"+
+ "<fetcher class=\"org.apache.tika.fetcher.FileSystemFetcher\">"+
+ "<params>"+
+ "<param name=\"basePath\" type=\"string\">"+inputDir.toAbsolutePath()+"</param>"+
+ "</params>"+
+ "</fetcher>"+
+ "</fetchers>"+
+ "<emitters>"+
+ "<emitter class=\"org.apache.tika.emitter.fs.FileSystemEmitter\">"+
+ "<params>"+
+ "<param name=\"basePath\" type=\"string\">"+ TMP_OUTPUT_DIR.toAbsolutePath()+"</param>"+
+ "</params>"+
+ "</emitter>"+
+ "</emitters>"+
+ "</properties>";
+ }
+
+ @AfterClass
+ public static void tearDownAfterClass() throws Exception {
+ FileUtils.deleteDirectory(TMP_DIR.toFile());
+ }
+
+ @Before
+ public void setUpEachTest() throws Exception {
+ if (Files.exists(TMP_OUTPUT_FILE)) {
+ Files.delete(TMP_OUTPUT_FILE);
+ }
+ }
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ List<ResourceProvider> rCoreProviders = new ArrayList<ResourceProvider>();
+ rCoreProviders.add(new SingletonResourceProvider(new EmitterResource()));
+ sf.setResourceProviders(rCoreProviders);
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<>();
+ providers.add(new JSONObjWriter());
+ sf.setProviders(providers);
+ }
+
+ @Override
+ protected InputStream getTikaConfigInputStream() {
+ return new ByteArrayInputStream(TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8));
+ }
+
+ @Override
+ protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
+ return new FetcherStreamFactory(tikaConfig.getFetcher());
+ }
+
+ @Test
+ public void testGet() throws Exception {
+ String q = "?fetchString="+ URLEncoder.encode("fs:hello_world.xml", StandardCharsets.UTF_8.name());
+ String getUrl = endPoint+EMITTER_PATH+q;
+ Response response = WebClient
+ .create(getUrl)
+ .accept("application/json").get();
+ assertEquals(200, response.getStatus());
+ Path targetFile = TMP_OUTPUT_DIR.resolve("hello_world.xml.json");
+ List<Metadata> metadataList = null;
+ try (Reader reader = Files.newBufferedReader(targetFile)) {
+ metadataList = JsonMetadataList.fromJson(reader);
+ }
+ assertEquals(1, metadataList.size());
+ Metadata metadata = metadataList.get(0);
+ assertEquals("hello world",
+ metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim());
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("你好,世界", metadata.get("title"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ //TODO: add put and post
+
+}