Author: dmeikle
Date: Sun May 26 11:28:51 2013
New Revision: 1486409
URL: http://svn.apache.org/r1486409
Log:
TIKA-1126 - Patch by Ali Mosavian to allow Tika Server to produce text/html
output
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1486409&r1=1486408&r2=1486409&view=diff
==============================================================================
---
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
(original)
+++
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
Sun May 26 11:28:51 2013
@@ -33,6 +33,7 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -40,6 +41,11 @@ import javax.mail.internet.ContentDispos
import javax.mail.internet.ParseException;
import javax.ws.rs.*;
import javax.ws.rs.core.*;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.List;
@@ -201,6 +207,77 @@ public class TikaResource {
};
}
+
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/html")
+ public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders
httpHeaders, @Context final UriInfo info) {
+ final AutoDetectParser parser = createParser();
+ final Metadata metadata = new Metadata();
+
+ fillMetadata(parser, metadata, httpHeaders);
+
+ logRequest(logger, info, metadata);
+
+ return new StreamingOutput() {
+ public void write(OutputStream outputStream)
+ throws IOException, WebApplicationException {
+ Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
+ ContentHandler content;
+
+ try {
+ SAXTransformerFactory factory =
(SAXTransformerFactory)SAXTransformerFactory.newInstance( );
+ TransformerHandler handler =
factory.newTransformerHandler( );
+
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ handler.setResult(new StreamResult(writer));
+ content = new ExpandedTitleContentHandler( handler );
+ }
+ catch ( TransformerConfigurationException e ) {
+ throw new WebApplicationException( e );
+ }
+
+ TikaInputStream tis = TikaInputStream.get(is);
+
+ try {
+ tis.getFile();
+ parser.parse(tis, content, metadata);
+ }
+ catch (SAXException e) {
+ throw new WebApplicationException(e);
+ }
+ catch (EncryptedDocumentException e) {
+ logger.warn(String.format(
+ "%s: Encrypted document",
+ info.getPath()
+ ), e);
+ throw new WebApplicationException(e,
Response.status(422).build());
+ }
+ catch (TikaException e) {
+ logger.warn(String.format(
+ "%s: Text extraction failed",
+ info.getPath()
+ ), e);
+
+ if (e.getCause()!=null && e.getCause() instanceof
WebApplicationException)
+ throw (WebApplicationException) e.getCause();
+
+ if (e.getCause()!=null && e.getCause() instanceof
IllegalStateException)
+ throw new
WebApplicationException(Response.status(422).build());
+
+ if (e.getCause()!=null && e.getCause() instanceof
OldWordFileFormatException)
+ throw new
WebApplicationException(Response.status(422).build());
+
+ throw new
WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
+ }
+ finally {
+ tis.close();
+ }
+ }
+ };
+ }
+
public static void logRequest(Log logger, UriInfo info, Metadata metadata) {
if (metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)==null)
{
logger.info(String.format(
Modified:
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1486409&r1=1486408&r2=1486409&view=diff
==============================================================================
---
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
(original)
+++
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
Sun May 26 11:28:51 2013
@@ -109,4 +109,25 @@ public class TikaResourceTest extends CX
assertEquals(UNPROCESSEABLE, response.getStatus());
}
+
+ @Test
+ public void testSimpleWordHTML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/msword")
+ .accept("text/html")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
+
+ @Test
+ public void testPasswordXLSHTML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/vnd.ms-excel")
+ .accept("text/html")
+ .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+
+ assertEquals(UNPROCESSEABLE, response.getStatus());
+ }
}