Author: mattmann
Date: Sun Jan 27 19:18:26 2013
New Revision: 1439144
URL: http://svn.apache.org/viewvc?rev=1439144&view=rev
Log:
Apply patch from Raimund Merkert and Chris Mattmann for TIKA-1047: Provide a
JAX-RS to detect only mediatype.
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java?rev=1439144&view=auto
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
(added)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
Sun Jan 27 19:18:26 2013
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.tika.metadata.Metadata;
+
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.ext.MessageBodyWriter;
+import javax.ws.rs.ext.Provider;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.annotation.Annotation;
+import java.lang.reflect.Type;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import au.com.bytecode.opencsv.CSVWriter;
+
+/**
+ * JAX-RS {@link MessageBodyWriter} that serialises a Tika {@link Metadata}
+ * object as <code>text/csv</code>. Every metadata name produces one CSV row:
+ * the first column holds the name, the remaining columns hold its values.
+ */
+@Provider
+@Produces("text/csv")
+public class CSVMessageBodyWriter implements MessageBodyWriter<Metadata> {
+
+  /**
+   * Accepts any entity whose class is {@link Metadata} or a subclass of it.
+   * The generic type, annotations and media type arguments are not inspected.
+   *
+   * @return <code>true</code> if <code>type</code> is assignable to
+   *         {@link Metadata}
+   */
+  @Override
+  public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+    return Metadata.class.isAssignableFrom(type);
+  }
+
+  /**
+   * Always returns -1; the serialised length is not computed in advance.
+   *
+   * @return -1
+   */
+  @Override
+  public long getSize(Metadata data, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+    return -1;
+  }
+
+  /**
+   * Writes one CSV row per metadata name to the entity stream, encoded as
+   * UTF-8. The stream is flushed but deliberately left open.
+   *
+   * @param metadata
+   *          the metadata to serialise
+   * @param entityStream
+   *          the stream the CSV rows are written to
+   * @throws IOException
+   *           if writing to or flushing the stream fails
+   */
+  @Override
+  public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations,
+      MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream)
+      throws IOException, WebApplicationException {
+
+    CSVWriter writer = new CSVWriter(new OutputStreamWriter(entityStream, "UTF-8"));
+
+    for (String name : metadata.names()) {
+      String[] values = metadata.getValues(name);
+      // Row layout: [name, value1, value2, ...]
+      String[] row = new String[values.length + 1];
+      row[0] = name;
+      System.arraycopy(values, 0, row, 1, values.length);
+      writer.writeNext(row);
+    }
+    // don't close, just flush the stream
+    writer.flush();
+  }
+}
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java?rev=1439144&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
(added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
Sun Jan 27 19:18:26 2013
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.*;
+import javax.ws.rs.core.Response.Status;
+
+import java.io.InputStream;
+
+/**
+ * This JAX-RS endpoint provides access to the metadata contained within a
+ * document. It is possible to submit a relatively small prefix (a few KB) of a
+ * document's content to retrieve individual metadata fields.
+ */
+@Path("/metadata")
+public class MetadataEP {
+  private static final Log logger = LogFactory.getLog(MetadataEP.class);
+
+  /** The parser to use */
+  private final AutoDetectParser parser;
+
+  /** The metadata for the request */
+  private final Metadata metadata = new Metadata();
+
+  /**
+   * Creates the endpoint: obtains a parser from
+   * {@link TikaResource#createParser()}, hands the parser, the metadata and the
+   * HTTP headers to <code>TikaResource.fillMetadata</code>, and logs the
+   * request through <code>TikaResource.logRequest</code>.
+   *
+   * @param httpHeaders
+   *          the headers of the incoming request
+   * @param info
+   *          the URI information of the incoming request
+   */
+  public MetadataEP(@Context HttpHeaders httpHeaders, @Context UriInfo info) {
+    parser = TikaResource.createParser();
+    TikaResource.fillMetadata(parser, metadata, httpHeaders);
+    TikaResource.logRequest(logger, info, metadata);
+  }
+
+  /**
+   * Get all metadata that can be parsed from the specified input stream. An
+   * error is produced if the input stream cannot be parsed.
+   *
+   * @param is
+   *          an input stream
+   * @return the metadata
+   * @throws Exception
+   *           if the parser fails on the stream
+   */
+  @POST
+  public Response getMetadata(InputStream is) throws Exception {
+    parser.parse(is, new DefaultHandler(), metadata);
+    return Response.ok(metadata).build();
+  }
+
+  /**
+   * Get a specific TIKA metadata field as a simple text string. If the field is
+   * multivalued, then only the first value is returned. If the input stream
+   * cannot be parsed, but a value was found for the given metadata field, then
+   * the value of the field is returned as part of a 200 OK response; otherwise
+   * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully
+   * parsed but the specific metadata field was not found, then a
+   * {@link Status#NOT_FOUND} is returned.
+   *
+   * @param field
+   *          the tika metadata field name
+   * @param is
+   *          the document stream
+   * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
+   *         {@link Status#BAD_REQUEST}
+   * @throws Exception
+   *           declared for interface compatibility; parse failures are caught
+   *           and logged rather than propagated
+   */
+  @POST
+  @Path("{field}")
+  @Produces(MediaType.TEXT_PLAIN)
+  public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
+    Status missingFieldStatus = parseForField(field, is);
+
+    String value = metadata.get(field);
+    if (value == null) {
+      return Response.status(missingFieldStatus).entity("Failed to get metadata field " + field).build();
+    }
+    return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build();
+  }
+
+  /**
+   * Get a specific metadata field. If the input stream cannot be parsed, but a
+   * value was found for the given metadata field, then the value of the field
+   * is returned as part of a 200 OK response; otherwise a
+   * {@link Status#BAD_REQUEST} is generated. If the stream was successfully
+   * parsed but the specific metadata field was not found, then a
+   * {@link Status#NOT_FOUND} is returned.
+   * <p>
+   * Note that this method handles multivalue fields and returns possibly more
+   * metadata than requested.
+   *
+   * @param field
+   *          the tika metadata field name
+   * @param is
+   *          the document stream
+   * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
+   *         {@link Status#BAD_REQUEST}
+   * @throws Exception
+   *           declared for interface compatibility; parse failures are caught
+   *           and logged rather than propagated
+   */
+  @POST
+  @Path("{field}")
+  public Response getMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
+    Status missingFieldStatus = parseForField(field, is);
+
+    String[] values = metadata.getValues(field);
+    if (values.length == 0) {
+      return Response.status(missingFieldStatus).entity("Failed to get metadata field " + field).build();
+    }
+    // remove fields we don't care about for the response
+    for (String name : metadata.names()) {
+      if (!field.equals(name)) {
+        metadata.remove(name);
+      }
+    }
+    return Response.ok(metadata).build();
+  }
+
+  /**
+   * Parses the stream into the request metadata and reports which status
+   * should be used if the requested field turns out to be absent.
+   *
+   * @param field
+   *          the requested field name, used only in the log message
+   * @param is
+   *          the document stream
+   * @return {@link Status#NOT_FOUND} when parsing completed, or
+   *         {@link Status#BAD_REQUEST} when parsing threw, to indicate that
+   *         we may not have had enough data to process the request
+   */
+  private Status parseForField(String field, InputStream is) {
+    try {
+      parser.parse(is, new DefaultHandler(), metadata);
+      // once we've parsed the document successfully, we should use NOT_FOUND
+      // if we did not see the field
+      return Status.NOT_FOUND;
+    } catch (Exception e) {
+      logger.info("Failed to process field " + field, e);
+      return Status.BAD_REQUEST;
+    }
+  }
+
+}
Added:
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java?rev=1439144&view=auto
==============================================================================
---
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java
(added)
+++
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java
Sun Jan 27 19:18:26 2013
@@ -0,0 +1,185 @@
+package org.apache.tika.server;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.cxf.binding.BindingFactoryManager;
+import org.apache.cxf.endpoint.Server;
+import org.apache.cxf.jaxrs.JAXRSBindingFactory;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.io.IOUtils;
+import org.eclipse.jetty.util.ajax.JSON;
+import org.junit.Assert;
+import org.junit.Test;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import au.com.bytecode.opencsv.CSVReader;
+
+/**
+ * Tests for {@link MetadataEP}. Each test runs against a CXF JAX-RS server
+ * that {@link #setUp()} starts with {@link MetadataEP} as its only resource
+ * and with the CSV and JSON message body writers registered.
+ */
+public class MetadataEPTest extends CXFTestBase {
+  private static final String META_PATH = "/metadata";
+
+  private static final String endPoint = "http://localhost:" + TikaServerCli.DEFAULT_PORT;
+
+  private Server server;
+
+  /**
+   * Reads at most <code>remaining</code> bytes from <code>in</code> and
+   * returns them as an in-memory stream. Used to submit only a prefix of a
+   * document.
+   *
+   * @param in
+   *          the stream to read from
+   * @param remaining
+   *          the maximum number of bytes to copy; nothing is read if it is
+   *          not positive
+   * @return a stream over the copied bytes
+   * @throws IOException
+   *           if reading from <code>in</code> fails
+   */
+  private static InputStream copy(InputStream in, int remaining) throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    // one reusable buffer instead of a fresh allocation per iteration
+    byte[] buffer = new byte[4096];
+    while (remaining > 0) {
+      int n = in.read(buffer, 0, Math.min(buffer.length, remaining));
+      if (n <= 0) {
+        break;
+      }
+      out.write(buffer, 0, n);
+      remaining -= n;
+    }
+    return new ByteArrayInputStream(out.toByteArray());
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see junit.framework.TestCase#setUp()
+   */
+  @Override
+  protected void setUp() throws Exception {
+    JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
+    sf.setResourceClasses(MetadataEP.class);
+    List<Object> providers = new ArrayList<Object>();
+    providers.add(new CSVMessageBodyWriter());
+    providers.add(new JSONMessageBodyWriter());
+    sf.setProviders(providers);
+    sf.setAddress(endPoint + "/");
+    BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
+    JAXRSBindingFactory factory = new JAXRSBindingFactory();
+    factory.setBus(sf.getBus());
+    manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
+    server = sf.create();
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see junit.framework.TestCase#tearDown()
+   */
+  @Override
+  protected void tearDown() throws Exception {
+    server.stop();
+    server.destroy();
+  }
+
+  /** Posts the whole test document and expects the Author in the CSV reply. */
+  @Test
+  public void testSimpleWord_CSV() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH).type("application/msword").accept("text/csv")
+        .post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    // CSVMessageBodyWriter encodes its output as UTF-8, so decode it the same
+    // way instead of relying on the platform default charset
+    Reader reader = new InputStreamReader((InputStream) response.getEntity(), "UTF-8");
+
+    CSVReader csvReader = new CSVReader(reader);
+
+    Map<String, String> metadata = new HashMap<String, String>();
+
+    // column 0 is the field name, column 1 its first value
+    String[] nextLine;
+    while ((nextLine = csvReader.readNext()) != null) {
+      metadata.put(nextLine[0], nextLine[1]);
+    }
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  /** Posts the whole test document and expects the Author in the JSON reply. */
+  @Test
+  public void testSimpleWord_JSON() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH).type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    Reader reader = new InputStreamReader((InputStream) response.getEntity());
+    Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  /** Requests only the Author field as plain text. */
+  @Test
+  public void testGetField_Author_TEXT() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.TEXT_PLAIN).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    StringWriter w = new StringWriter();
+    IOUtils.copy((InputStream) response.getEntity(), w);
+    assertEquals("Maxim Valyanskiy", w.toString());
+  }
+
+  /** Requests only the Author field as JSON. */
+  @Test
+  public void testGetField_Author_JSON() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    Reader reader = new InputStreamReader((InputStream) response.getEntity());
+    Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  /** A field that the document does not contain is expected to yield 404. */
+  @Test
+  public void testGetField_XXX_NotFound() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/xxx").type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.NOT_FOUND.getStatusCode(), response.getStatus());
+  }
+
+  /**
+   * Submits only the first 8000 bytes of the test document and expects a 400
+   * response for the Author field.
+   */
+  @Test
+  public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception {
+
+    InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.TEXT_PLAIN).post(copy(stream, 8000));
+    Assert.assertEquals(Status.BAD_REQUEST.getStatusCode(), response.getStatus());
+  }
+
+  /**
+   * Submits only the first 12000 bytes of the test document and expects the
+   * Author field to be returned nonetheless.
+   */
+  @Test
+  public void testGetField_Author_TEXT_Partial_Found() throws Exception {
+
+    InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.TEXT_PLAIN).post(copy(stream, 12000));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    StringWriter w = new StringWriter();
+    IOUtils.copy((InputStream) response.getEntity(), w);
+    assertEquals("Maxim Valyanskiy", w.toString());
+  }
+
+}