Author: mattmann
Date: Sun Jan 27 19:18:26 2013
New Revision: 1439144

URL: http://svn.apache.org/viewvc?rev=1439144&view=rev
Log:
Apply patch from Raimund Merkert and Chris Mattmann for TIKA-1047: Provide a 
JAX-RS to detect only mediatype.

Added:
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java

Added: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java?rev=1439144&view=auto
==============================================================================
--- 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
 (added)
+++ 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/CSVMessageBodyWriter.java
 Sun Jan 27 19:18:26 2013
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.tika.metadata.Metadata;
+
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.ext.MessageBodyWriter;
+import javax.ws.rs.ext.Provider;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.annotation.Annotation;
+import java.lang.reflect.Type;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import au.com.bytecode.opencsv.CSVWriter;
+
+/**
+ * JAX-RS provider that serializes a Tika {@link Metadata} object as
+ * {@code text/csv}.
+ * <p>
+ * Every metadata name becomes one CSV record: the first column holds the name
+ * and the following columns hold the values returned for that name.
+ */
+@Provider
+@Produces("text/csv")
+public class CSVMessageBodyWriter implements MessageBodyWriter<Metadata> {
+
+  /**
+   * Accepts any entity whose class is {@link Metadata} or a subclass of it.
+   * The remaining arguments are not consulted.
+   */
+  @Override
+  public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+    return Metadata.class.isAssignableFrom(type);
+  }
+
+  /**
+   * Always returns -1; the size of the CSV output is not computed up front.
+   */
+  @Override
+  public long getSize(Metadata data, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+    return -1;
+  }
+
+  /**
+   * Writes the metadata to the entity stream as UTF-8 encoded CSV, one record
+   * per metadata name.
+   *
+   * @param metadata
+   *          the metadata to serialize
+   * @param entityStream
+   *          the stream to write to; it is flushed but left open
+   * @throws IOException
+   *           if writing to or flushing the stream fails
+   */
+  @Override
+  public void writeTo(Metadata metadata, Class<?> type, Type genericType, Annotation[] annotations,
+      MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream) throws IOException,
+      WebApplicationException {
+
+    CSVWriter writer = new CSVWriter(new OutputStreamWriter(entityStream, "UTF-8"));
+
+    for (String name : metadata.names()) {
+      String[] values = metadata.getValues(name);
+      // Record layout: [name, value1, value2, ...]. The row array is sized
+      // explicitly instead of relying on toArray() to reallocate.
+      String[] row = new String[values.length + 1];
+      row[0] = name;
+      System.arraycopy(values, 0, row, 1, values.length);
+      writer.writeNext(row);
+    }
+    // don't close, just flush the stream: closing the writer would also close
+    // entityStream, which was handed to us by the caller
+    writer.flush();
+  }
+}

Added: 
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java?rev=1439144&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java 
(added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java 
Sun Jan 27 19:18:26 2013
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.core.*;
+import javax.ws.rs.core.Response.Status;
+
+import java.io.InputStream;
+
+/**
+ * This JAX-RS endpoint provides access to the metadata contained within a
+ * document. It is possible to submit a relatively small prefix (a few KB) of a
+ * document's content to retrieve individual metadata fields.
+ */
+@Path("/metadata")
+public class MetadataEP {
+  private static final Log logger = LogFactory.getLog(MetadataEP.class);
+
+  /** The parser to use */
+  private final AutoDetectParser parser;
+
+  /** The metadata for the request */
+  private final Metadata metadata = new Metadata();
+
+  /**
+   * Creates the endpoint: obtains a parser from {@link TikaResource}, passes
+   * the parser, the metadata and the request headers to
+   * {@code TikaResource.fillMetadata}, and logs the request.
+   *
+   * @param httpHeaders
+   *          the headers of the current request
+   * @param info
+   *          the URI information of the current request
+   */
+  public MetadataEP(@Context HttpHeaders httpHeaders, @Context UriInfo info) {
+    parser = TikaResource.createParser();
+    // NOTE(review): presumably seeds the metadata (e.g. content type) from the
+    // request headers - confirm against TikaResource
+    TikaResource.fillMetadata(parser, metadata, httpHeaders);
+    TikaResource.logRequest(logger, info, metadata);
+  }
+
+  /**
+   * Get all metadata that can be parsed from the specified input stream. An
+   * error is produced if the input stream cannot be parsed.
+   *
+   * @param is
+   *          an input stream
+   * @return the metadata
+   * @throws Exception
+   *           if the parser fails on the stream
+   */
+  @POST
+  public Response getMetadata(InputStream is) throws Exception {
+    parser.parse(is, new DefaultHandler(), metadata);
+    return Response.ok(metadata).build();
+  }
+
+  /**
+   * Get a specific TIKA metadata field as a simple text string. If the field is
+   * multivalued, then only the first value is returned. If the input stream
+   * cannot be parsed, but a value was found for the given metadata field, then
+   * the value of the field is returned as part of a 200 OK response; otherwise
+   * a {@link Status#BAD_REQUEST} is generated. If the stream was successfully
+   * parsed but the specific metadata field was not found, then a
+   * {@link Status#NOT_FOUND} is returned.
+   *
+   * @param field
+   *          the tika metadata field name
+   * @param is
+   *          the document stream
+   * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
+   *         {@link Status#BAD_REQUEST}
+   * @throws Exception
+   */
+  @POST
+  @Path("{field}")
+  @Produces(MediaType.TEXT_PLAIN)
+  public Response getSimpleMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
+    Status defaultErrorResponse = parseForField(field, is);
+
+    String value = metadata.get(field);
+    if (value == null) {
+      return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
+    }
+    return Response.ok(value, MediaType.TEXT_PLAIN_TYPE).build();
+  }
+
+  /**
+   * Get a specific metadata field. If the input stream cannot be parsed, but a
+   * value was found for the given metadata field, then the value of the field
+   * is returned as part of a 200 OK response; otherwise a
+   * {@link Status#BAD_REQUEST} is generated. If the stream was successfully
+   * parsed but the specific metadata field was not found, then a
+   * {@link Status#NOT_FOUND} is returned.
+   * <p>
+   * Unlike {@link #getSimpleMetadataField(String, InputStream)}, the response
+   * entity is the {@link Metadata} object itself, reduced to the requested
+   * field, so all values of a multivalued field are returned.
+   *
+   * @param field
+   *          the tika metadata field name
+   * @param is
+   *          the document stream
+   * @return one of {@link Status#OK}, {@link Status#NOT_FOUND}, or
+   *         {@link Status#BAD_REQUEST}
+   * @throws Exception
+   */
+  @POST
+  @Path("{field}")
+  public Response getMetadataField(@PathParam("field") String field, InputStream is) throws Exception {
+    Status defaultErrorResponse = parseForField(field, is);
+
+    String[] values = metadata.getValues(field);
+    if (values.length == 0) {
+      return Response.status(defaultErrorResponse).entity("Failed to get metadata field " + field).build();
+    }
+    // remove fields we don't care about for the response
+    // NOTE(review): removing while looping relies on names() returning a
+    // snapshot rather than a live view - confirm against Metadata
+    for (String name : metadata.names()) {
+      if (!field.equals(name)) {
+        metadata.remove(name);
+      }
+    }
+    return Response.ok(metadata).build();
+  }
+
+  /**
+   * Parses the stream into the request metadata on a best-effort basis and
+   * reports which error status applies if the requested field turns out to be
+   * absent. Parse failures are logged and not rethrown, so that metadata
+   * collected before the failure can still be served.
+   *
+   * @param field
+   *          the requested field name, used only in the log message
+   * @param is
+   *          the document stream
+   * @return {@link Status#NOT_FOUND} if the document was parsed successfully;
+   *         {@link Status#BAD_REQUEST} if parsing failed, to indicate that we
+   *         may not have had enough data to process the request
+   */
+  private Status parseForField(String field, InputStream is) {
+    try {
+      parser.parse(is, new DefaultHandler(), metadata);
+      // once we've parsed the document successfully, we should use NOT_FOUND
+      // if we did not see the field
+      return Status.NOT_FOUND;
+    } catch (Exception e) {
+      logger.info("Failed to process field " + field, e);
+      return Status.BAD_REQUEST;
+    }
+  }
+
+}

Added: 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java?rev=1439144&view=auto
==============================================================================
--- 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java 
(added)
+++ 
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataEPTest.java 
Sun Jan 27 19:18:26 2013
@@ -0,0 +1,185 @@
+package org.apache.tika.server;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.cxf.binding.BindingFactoryManager;
+import org.apache.cxf.endpoint.Server;
+import org.apache.cxf.jaxrs.JAXRSBindingFactory;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.io.IOUtils;
+import org.eclipse.jetty.util.ajax.JSON;
+import org.junit.Assert;
+import org.junit.Test;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import au.com.bytecode.opencsv.CSVReader;
+
+/**
+ * Tests for the {@link MetadataEP} endpoint: starts a CXF JAX-RS server that
+ * exposes the endpoint together with the CSV and JSON body writers, and posts
+ * a test document (or a prefix of it) to it.
+ */
+public class MetadataEPTest extends CXFTestBase {
+  private static final String META_PATH = "/metadata";
+
+  private static final String endPoint = "http://localhost:" + TikaServerCli.DEFAULT_PORT;
+
+  private Server server;
+
+  /**
+   * Reads at most {@code remaining} bytes from {@code in} and returns them as
+   * an in-memory stream. Stops early if the input ends. The input stream is
+   * not closed.
+   */
+  private static InputStream copy(InputStream in, int remaining) throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    while (remaining > 0) {
+      byte[] bytes = new byte[remaining];
+      int n = in.read(bytes);
+      if (n <= 0) {
+        break;
+      }
+      out.write(bytes, 0, n);
+      remaining -= n;
+    }
+    return new ByteArrayInputStream(out.toByteArray());
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see junit.framework.TestCase#setUp()
+   */
+  @Override
+  protected void setUp() throws Exception {
+    JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
+    sf.setResourceClasses(MetadataEP.class);
+    List<Object> providers = new ArrayList<Object>();
+    providers.add(new CSVMessageBodyWriter());
+    providers.add(new JSONMessageBodyWriter());
+    sf.setProviders(providers);
+    sf.setAddress(endPoint + "/");
+    BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
+    JAXRSBindingFactory factory = new JAXRSBindingFactory();
+    factory.setBus(sf.getBus());
+    manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
+    server = sf.create();
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see junit.framework.TestCase#tearDown()
+   */
+  @Override
+  protected void tearDown() throws Exception {
+    // setUp may have failed before the server was created
+    if (server != null) {
+      server.stop();
+      server.destroy();
+    }
+  }
+
+  @Test
+  public void testSimpleWord_CSV() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH).type("application/msword").accept("text/csv")
+        .post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    // CSVMessageBodyWriter emits UTF-8, so decode with the same charset
+    Reader reader = new InputStreamReader((InputStream) response.getEntity(), "UTF-8");
+
+    CSVReader csvReader = new CSVReader(reader);
+
+    Map<String, String> metadata = new HashMap<String, String>();
+
+    try {
+      String[] nextLine;
+      while ((nextLine = csvReader.readNext()) != null) {
+        // column 0 is the field name, column 1 its first value; skip rows
+        // that carry no value
+        if (nextLine.length > 1) {
+          metadata.put(nextLine[0], nextLine[1]);
+        }
+      }
+    } finally {
+      csvReader.close();
+    }
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  @Test
+  public void testSimpleWord_JSON() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH).type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    // NOTE(review): assumes the JSON writer emits UTF-8 - confirm against
+    // JSONMessageBodyWriter
+    Reader reader = new InputStreamReader((InputStream) response.getEntity(), "UTF-8");
+    Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  @Test
+  public void testGetField_Author_TEXT() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.TEXT_PLAIN).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    StringWriter w = new StringWriter();
+    IOUtils.copy((InputStream) response.getEntity(), w);
+    assertEquals("Maxim Valyanskiy", w.toString());
+  }
+
+  @Test
+  public void testGetField_Author_JSON() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+    // NOTE(review): assumes the JSON writer emits UTF-8 - confirm against
+    // JSONMessageBodyWriter
+    Reader reader = new InputStreamReader((InputStream) response.getEntity(), "UTF-8");
+    Map<?, ?> metadata = (Map<?, ?>) JSON.parse(reader);
+
+    assertNotNull(metadata.get("Author"));
+    assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+  }
+
+  @Test
+  public void testGetField_XXX_NotFound() throws Exception {
+    Response response = WebClient.create(endPoint + META_PATH + "/xxx").type("application/msword")
+        .accept(MediaType.APPLICATION_JSON).post(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+    Assert.assertEquals(Status.NOT_FOUND.getStatusCode(), response.getStatus());
+  }
+
+  @Test
+  public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception {
+
+    InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+
+    try {
+      // presumably 8000 bytes of the test document is too short a prefix to
+      // reach the Author field - verify against the test resource
+      Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+          .accept(MediaType.TEXT_PLAIN).post(copy(stream, 8000));
+      Assert.assertEquals(Status.BAD_REQUEST.getStatusCode(), response.getStatus());
+    } finally {
+      stream.close();
+    }
+  }
+
+  @Test
+  public void testGetField_Author_TEXT_Partial_Found() throws Exception {
+
+    InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+
+    try {
+      // presumably a 12000 byte prefix is still incomplete but already
+      // contains the Author field - verify against the test resource
+      Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
+          .accept(MediaType.TEXT_PLAIN).post(copy(stream, 12000));
+      Assert.assertEquals(Status.OK.getStatusCode(), response.getStatus());
+
+      StringWriter w = new StringWriter();
+      IOUtils.copy((InputStream) response.getEntity(), w);
+      assertEquals("Maxim Valyanskiy", w.toString());
+    } finally {
+      stream.close();
+    }
+  }
+
+}


Reply via email to