This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3226
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3226 by this push:
     new 69b8fbb  TIKA-3226 -- WIP do not merge -- add emitter test, other cleanup
69b8fbb is described below

commit 69b8fbb16a453ecd968f58b58d117876c5b19d48
Author: tballison <[email protected]>
AuthorDate: Wed Jan 20 11:00:44 2021 -0500

    TIKA-3226 -- WIP do not merge -- add emitter test, other cleanup
---
 .../org/apache/tika/fetcher/FileSystemFetcher.java |   2 +-
 .../apache/tika/emitter/fs/FileSystemEmitter.java  |   7 +-
 .../apache/tika/server/classic/FetcherTest.java    |  11 +-
 tika-server/tika-server-core/pom.xml               |   6 +
 .../tika/server/core/resource/EmitterResource.java |  95 ++++++++++---
 .../tika/server/core/resource/TikaResource.java    |   4 +-
 .../apache/tika/server/core/StackTraceOffTest.java |   5 +-
 .../apache/tika/server/core/StackTraceTest.java    |   3 +-
 .../apache/tika/server/core/TikaEmitterTest.java   | 150 +++++++++++++++++++++
 9 files changed, 248 insertions(+), 35 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java 
b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
index 83a6677..b93f202 100644
--- a/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/fetcher/FileSystemFetcher.java
@@ -33,7 +33,7 @@ import java.util.Set;
 
 public class FileSystemFetcher implements Fetcher {
 
-    private static String PREFIX = "file";
+    private static String PREFIX = "fs";
     private static final Set<String> SUPPORTED = Collections.singleton(PREFIX);
     private Path basePath = null;
     @Override
diff --git 
a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
 
b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
index a99c013..54219a8 100644
--- 
a/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
+++ 
b/tika-emitters/tika-emitter-fs/src/main/java/org/apache/tika/emitter/fs/FileSystemEmitter.java
@@ -39,6 +39,9 @@ public class FileSystemEmitter implements Emitter {
         String relPath = metadataList.get(0)
                 .get(TikaCoreProperties.SOURCE_PATH);
 
+        if (fileExtension != null && fileExtension.length() > 0) {
+            relPath += "." + fileExtension;
+        }
         if (basePath != null) {
             output = basePath.resolve(relPath);
         } else {
@@ -54,8 +57,8 @@ public class FileSystemEmitter implements Emitter {
     }
 
     @Field
-    public void setBasePath(Path basePath) {
-        this.basePath = basePath;
+    public void setBasePath(String basePath) {
+        this.basePath = Paths.get(basePath);
     }
 
     /**
diff --git 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
index 7ff931e..94effe3 100644
--- 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
+++ 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/FetcherTest.java
@@ -20,27 +20,19 @@ package org.apache.tika.server.classic;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.apache.tika.server.core.CXFTestBase;
-import org.apache.tika.server.core.DefaultInputStreamFactory;
 import org.apache.tika.server.core.FetcherStreamFactory;
 import org.apache.tika.server.core.InputStreamFactory;
 import org.apache.tika.server.core.resource.RecursiveMetadataResource;
-import org.apache.tika.server.core.resource.TikaResource;
 import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+import org.junit.Ignore;
 import org.junit.Test;
 
-import javax.ws.rs.core.MultivaluedHashMap;
-import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -54,6 +46,7 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+@Ignore("turn into actual unit tests")
 public class FetcherTest extends CXFTestBase {
 
     private static final String META_PATH = "/rmeta";
diff --git a/tika-server/tika-server-core/pom.xml 
b/tika-server/tika-server-core/pom.xml
index 11e6664..740debe 100644
--- a/tika-server/tika-server-core/pom.xml
+++ b/tika-server/tika-server-core/pom.xml
@@ -185,6 +185,12 @@
         </dependency>
 
         <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-emitter-fs</artifactId>
+            <version>${project.version}</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
             <scope>test</scope>
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
index 3bb0b0e..b706420 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/EmitterResource.java
@@ -17,30 +17,24 @@
 
 package org.apache.tika.server.core.resource;
 
-import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.tika.emitter.Emitter;
 import org.apache.tika.emitter.TikaEmitterException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fetcher.Fetcher;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.server.core.MetadataList;
 import org.apache.tika.utils.ExceptionUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.ws.rs.Consumes;
+import javax.ws.rs.GET;
 import javax.ws.rs.POST;
 import javax.ws.rs.PUT;
 import javax.ws.rs.Path;
 import javax.ws.rs.PathParam;
 import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
-import javax.ws.rs.core.MultivaluedMap;
-import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriInfo;
 import java.io.IOException;
 import java.io.InputStream;
@@ -48,17 +42,45 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import static org.apache.tika.server.core.resource.TikaResource.fillMetadata;
-import static 
org.apache.tika.server.core.resource.TikaResource.fillParseContext;
-
 @Path("/emit")
 public class EmitterResource {
 
     private static final String EMITTER_PARAM = "emitter";
+    private static final String FETCH_STRING = "fetchString";
     private static final Logger LOG = 
LoggerFactory.getLogger(EmitterResource.class);
 
 
     /**
+     *
+     * @param httpHeaders
+     * @param info
+     * @param emitterName
+     * @param fetchString specify the fetch string in the url's query section
+     * @return
+     * @throws Exception
+     */
+    @GET
+    @Produces("application/json")
+    @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
+    public Map<String, String> getMetadata(InputStream is, @Context 
HttpHeaders httpHeaders,
+                                           @Context UriInfo info,
+                                           @PathParam(EMITTER_PARAM) String 
emitterName,
+                                           @QueryParam(FETCH_STRING) String 
fetchString) throws Exception {
+
+        Metadata metadata = new Metadata();
+        Fetcher fetcher = TikaResource.getConfig().getFetcher();
+        List<Metadata> metadataList;
+        try (InputStream fetchedIs = fetcher.fetch(fetchString, metadata)) {
+            metadataList =
+                    RecursiveMetadataResource.parseMetadata(fetchedIs,
+                            metadata,
+                            httpHeaders.getRequestHeaders(), info, "text");
+        }
+        return emit(emitterName, metadataList);
+    }
+
+
+    /**
      * Returns an InputStream that can be deserialized as a list of
      * {@link Metadata} objects.
      * The first in the list represents the main document, and the
@@ -76,26 +98,63 @@ public class EmitterResource {
      * @return InputStream that can be deserialized as a list of {@link 
Metadata} objects
      * @throws Exception
      */
-
     @PUT
     @Produces("application/json")
     @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
+    public Map<String, String> getMetadataFromInputStream(InputStream is,
+                                           @Context HttpHeaders httpHeaders,
+                                           @Context UriInfo info,
+                                           @PathParam(EMITTER_PARAM) String 
emitterName
+    ) throws Exception {
+
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList =
+                RecursiveMetadataResource.parseMetadata(is,
+                        metadata,
+                        httpHeaders.getRequestHeaders(), info, "text");
+        return emit(emitterName, metadataList);
+    }
+
+    /**
+     * Returns an InputStream that can be deserialized as a list of
+     * {@link Metadata} objects.
+     * The first in the list represents the main document, and the
+     * rest represent metadata for the embedded objects.  This works
+     * recursively through all descendants of the main document, not
+     * just the immediate children.
+     * <p>
+     * The extracted text content is stored with the key
+     * {@link 
org.apache.tika.sax.AbstractRecursiveParserWrapperHandler#TIKA_CONTENT}
+     * <p>
+     * Must specify an emitter in the path, e.g. /emit/solr
+     * @param info uri info
+     * @param emitterName which emitter to use; emitters must be configured in
+     *                    the TikaConfig file.
+     * @return InputStream that can be deserialized as a list of {@link 
Metadata} objects
+     * @throws Exception
+     */
+    @POST
+    @Produces("application/json")
+    @Path("{" + EMITTER_PARAM + " : (\\w+)?}")
     public Map<String, String> getMetadata(InputStream is,
                                 @Context HttpHeaders httpHeaders,
                                 @Context UriInfo info,
                                 @PathParam(EMITTER_PARAM) String emitterName
                                 ) throws Exception {
 
-        String status = "ok";
-        String exceptionMsg = "";
         Metadata metadata = new Metadata();
         List<Metadata> metadataList =
                 
RecursiveMetadataResource.parseMetadata(TikaResource.getInputStream(is, 
metadata,
                         httpHeaders),
-                                               metadata,
-                                               
httpHeaders.getRequestHeaders(), info, "text");
+                        metadata,
+                        httpHeaders.getRequestHeaders(), info, "text");
+        return emit(emitterName, metadataList);
+    }
 
+    private Map<String, String> emit(String emitterName, List<Metadata> 
metadataList) throws TikaException {
         Emitter emitter = TikaResource.getConfig().getEmitter();
+        String status = "ok";
+        String exceptionMsg = "";
         try {
             emitter.emit(emitterName, metadataList);
         } catch (IOException|TikaEmitterException e) {
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 896f50d..46cf093 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -279,7 +279,9 @@ public class TikaResource {
         }
 
         String contentTypeHeader = 
httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
-        javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null
+        javax.ws.rs.core.MediaType mediaType =
+                (contentTypeHeader == null || "*/*".equals(contentTypeHeader))
+                        ? null
                 : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
         if (mediaType != null && "xml".equals(mediaType.getSubtype())) {
             mediaType = null;
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
index 2c418f0..70a6b78 100644
--- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
@@ -1,6 +1,4 @@
-package org.apache.tika.server.core;
-
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,6 +14,7 @@ package org.apache.tika.server.core;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.tika.server.core;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
index 90c9a6d..0df6a78 100644
--- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
@@ -1,4 +1,3 @@
-package org.apache.tika.server.core;
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -15,6 +14,8 @@ package org.apache.tika.server.core;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.tika.server.core;
+
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java
new file mode 100644
index 0000000..ec26812
--- /dev/null
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaEmitterTest.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.core;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.server.core.resource.EmitterResource;
+import org.apache.tika.server.core.writer.JSONObjWriter;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import javax.ws.rs.core.Response;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * This offers basic integration tests with fetchers and emitters.
+ * We use file system fetchers and emitters.
+ */
+public class TikaEmitterTest extends CXFTestBase {
+
+    private static final String EMITTER_PATH = "/emit/fs";
+    private static Path TMP_DIR;
+    private static Path TMP_OUTPUT_DIR;
+    private static Path TMP_OUTPUT_FILE;
+    private static String TIKA_CONFIG_XML;
+
+    @BeforeClass
+    public static void setUpBeforeClass() throws Exception {
+        TMP_DIR = Files.createTempDirectory("tika-emitter-test-");
+        Path inputDir = TMP_DIR.resolve("input");
+        TMP_OUTPUT_DIR = TMP_DIR.resolve("output");
+        TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve("hello_world.xml.json");
+        Files.createDirectories(inputDir);
+        Files.createDirectories(TMP_OUTPUT_DIR);
+        
Files.copy(TikaEmitterTest.class.getResourceAsStream("/test-documents/mock/hello_world.xml"),
+                inputDir.resolve("hello_world.xml"));
+        TIKA_CONFIG_XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"+
+                "<properties>"+
+                    "<fetchers>"+
+                        "<fetcher 
class=\"org.apache.tika.fetcher.FileSystemFetcher\">"+
+                            "<params>"+
+                                "<param name=\"basePath\" 
type=\"string\">"+inputDir.toAbsolutePath()+"</param>"+
+                            "</params>"+
+                        "</fetcher>"+
+                    "</fetchers>"+
+                    "<emitters>"+
+                        "<emitter 
class=\"org.apache.tika.emitter.fs.FileSystemEmitter\">"+
+                            "<params>"+
+                                "<param name=\"basePath\" type=\"string\">"+ 
TMP_OUTPUT_DIR.toAbsolutePath()+"</param>"+
+                            "</params>"+
+                        "</emitter>"+
+                    "</emitters>"+
+                "</properties>";
+    }
+
+    @AfterClass
+    public static void tearDownAfterClass() throws Exception {
+        FileUtils.deleteDirectory(TMP_DIR.toFile());
+    }
+
+    @Before
+    public void setUpEachTest() throws Exception {
+        if (Files.exists(TMP_OUTPUT_FILE)) {
+            Files.delete(TMP_OUTPUT_FILE);
+        }
+    }
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        List<ResourceProvider> rCoreProviders = new 
ArrayList<ResourceProvider>();
+        rCoreProviders.add(new SingletonResourceProvider(new 
EmitterResource()));
+        sf.setResourceProviders(rCoreProviders);
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new JSONObjWriter());
+        sf.setProviders(providers);
+    }
+
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return new 
ByteArrayInputStream(TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8));
+    }
+
+    @Override
+    protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
+        return new FetcherStreamFactory(tikaConfig.getFetcher());
+    }
+
+    @Test
+    public void testGet() throws Exception {
+        String q = "?fetchString="+ URLEncoder.encode("fs:hello_world.xml", 
StandardCharsets.UTF_8.name());
+        String getUrl = endPoint+EMITTER_PATH+q;
+        Response response = WebClient
+                .create(getUrl)
+                .accept("application/json").get();
+        assertEquals(200, response.getStatus());
+        Path targetFile = TMP_OUTPUT_DIR.resolve("hello_world.xml.json");
+        List<Metadata> metadataList = null;
+        try (Reader reader = Files.newBufferedReader(targetFile)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertEquals(1, metadataList.size());
+        Metadata metadata = metadataList.get(0);
+        assertEquals("hello world",
+                
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).trim());
+        assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+        assertEquals("你好,世界", metadata.get("title"));
+        assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    //TODO: add put and post
+
+}

Reply via email to