This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push: new ea19d62 TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server new 3cc4bc2 Merge remote-tracking branch 'origin/master' ea19d62 is described below commit ea19d622bb76d2dab7561d6f9cffd7667b92e75b Author: tballison <talli...@mitre.org> AuthorDate: Wed May 3 21:23:39 2017 -0400 TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server --- CHANGES.txt | 3 ++ .../apache/tika/server/resource/TikaResource.java | 43 ++++++++++++++++++++++ .../org/apache/tika/server/TikaResourceTest.java | 31 +++++++++++++++- tika-server/src/test/resources/testHTML.html | 28 ++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 8348be7..de5606d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -34,6 +34,9 @@ Release 1.15 - ?? * Bug fix for WordPerfect via Pascal Essiembre (TIKA-2352). + * Added "text-main" equivalent option to tika-server via + /tika/main (TIKA-2343). + * Enabled configuration of the EncodingDetector used by parsers that extend AbstractEncodingDetectorParser (TIKA-2273). 
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java index 288d11d..55f0160 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java @@ -67,6 +67,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.parser.html.BoilerpipeContentHandler; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; @@ -339,6 +340,48 @@ public class TikaResource { return produceText(att.getObject(InputStream.class), att.getHeaders(), info); } + //this is equivalent to text-main in tika-app + @PUT + @Consumes("*/*") + @Produces("text/plain") + @Path("main") + public StreamingOutput getTextMain(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) { + return produceTextMain(is, httpHeaders.getRequestHeaders(), info); + } + + //this is equivalent to text-main (Boilerpipe handler) in tika-app + @PUT + @Consumes("multipart/form-data") + @Produces("text/plain") + @Path("form/main") + public StreamingOutput getTextMainFromMultipart(final Attachment att, @Context final UriInfo info) { + return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info); + } + + public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) { + final Parser parser = createParser(); + final Metadata metadata = new Metadata(); + final ParseContext context = new ParseContext(); + + fillMetadata(parser, metadata, context, httpHeaders); + fillParseContext(context, httpHeaders, parser); + + logRequest(LOG, info, metadata); + + 
return new StreamingOutput() { + public void write(OutputStream outputStream) throws IOException, WebApplicationException { + Writer writer = new OutputStreamWriter(outputStream, UTF_8); + + ContentHandler handler = new BoilerpipeContentHandler(writer); + + try (InputStream inputStream = is) { + parse(parser, LOG, info.getPath(), inputStream, handler, metadata, context); + } + } + }; + } + + @PUT @Consumes("*/*") @Produces("text/plain") diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java index be5092a..7df879e 100644 --- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java +++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java @@ -18,6 +18,7 @@ package org.apache.tika.server; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import javax.ws.rs.core.Response; @@ -36,7 +37,6 @@ import org.junit.Test; public class TikaResourceTest extends CXFTestBase { public static final String TEST_DOC = "test.doc"; - public static final String TEST_XLSX = "16637.xlsx"; public static final String TEST_PASSWORD_PROTECTED = "password.xls"; private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx"; @@ -77,6 +77,35 @@ public class TikaResourceTest extends CXFTestBase { } @Test + public void testTextMain() throws Exception { + //boilerpipe + Response response = WebClient.create(endPoint + TIKA_PATH + "/main") + .accept("text/plain") + .put(ClassLoader.getSystemResourceAsStream("testHTML.html")); + String responseMsg = getStringFromInputStream((InputStream) response + .getEntity()); + assertTrue(responseMsg.contains("Title : Test Indexation Html")); + assertFalse(responseMsg.contains("Indexation du fichier")); + } + + @Test + public void testTextMainMultipart() throws Exception { + //boilerpipe + Attachment attachmentPart = + new 
Attachment("myhtml", "text/html", ClassLoader.getSystemResourceAsStream("testHTML.html")); + + + Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main") + .type("multipart/form-data") + .accept("text/plain") + .put(attachmentPart); + String responseMsg = getStringFromInputStream((InputStream) response + .getEntity()); + assertTrue(responseMsg.contains("Title : Test Indexation Html")); + assertFalse(responseMsg.contains("Indexation du fichier")); + } + + @Test public void testApplicationWadl() throws Exception { Response response = WebClient .create(endPoint + TIKA_PATH + "?_wadl") diff --git a/tika-server/src/test/resources/testHTML.html b/tika-server/src/test/resources/testHTML.html new file mode 100644 index 0000000..5bbd4d8 --- /dev/null +++ b/tika-server/src/test/resources/testHTML.html @@ -0,0 +1,28 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<html> + <head> + <title>Title : Test Indexation Html</title> + <meta name="Author" content="Tika Developers"> + <meta name="ICBM" content="51.2312, -5.1987"> + <meta http-equiv="refresh" content="5"> + </head> + <body> + <h1><a name="test-anchor"></a>Test Indexation Html</h1> + <p><a href="http://www.apache.org/">Indexation</a> du fichier</p> + </body> +</html> -- To stop receiving notification emails like this one, please contact ['"commits@tika.apache.org" <commits@tika.apache.org>'].