This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ea19d62 TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server
new 3cc4bc2 Merge remote-tracking branch 'origin/master'
ea19d62 is described below
commit ea19d622bb76d2dab7561d6f9cffd7667b92e75b
Author: tballison <[email protected]>
AuthorDate: Wed May 3 21:23:39 2017 -0400
TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server
---
CHANGES.txt | 3 ++
.../apache/tika/server/resource/TikaResource.java | 43 ++++++++++++++++++++++
.../org/apache/tika/server/TikaResourceTest.java | 31 +++++++++++++++-
tika-server/src/test/resources/testHTML.html | 28 ++++++++++++++
4 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8348be7..de5606d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -34,6 +34,9 @@ Release 1.15 - ??
* Bug fix for WordPerfect via Pascal Essiembre (TIKA-2352).
+ * Added "text-main" equivalent option to tika-server via
+ /tika/main (TIKA-2343).
+
* Enabled configuration of the EncodingDetector used by
parsers that extend AbstractEncodingDetectorParser (TIKA-2273).
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 288d11d..55f0160 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -67,6 +67,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
@@ -339,6 +340,48 @@ public class TikaResource {
return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
}
+ //this is equivalent to text-main in tika-app
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/plain")
+ @Path("main")
+ public StreamingOutput getTextMain(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ return produceTextMain(is, httpHeaders.getRequestHeaders(), info);
+ }
+
+ //this is equivalent to text-main (Boilerpipe handler) in tika-app
+ @PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/plain")
+ @Path("form/main")
+ public StreamingOutput getTextMainFromMultipart(final Attachment att, @Context final UriInfo info) {
+ return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info);
+ }
+
+ public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) {
+ final Parser parser = createParser();
+ final Metadata metadata = new Metadata();
+ final ParseContext context = new ParseContext();
+
+ fillMetadata(parser, metadata, context, httpHeaders);
+ fillParseContext(context, httpHeaders, parser);
+
+ logRequest(LOG, info, metadata);
+
+ return new StreamingOutput() {
+ public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+ Writer writer = new OutputStreamWriter(outputStream, UTF_8);
+
+ ContentHandler handler = new BoilerpipeContentHandler(writer);
+
+ try (InputStream inputStream = is) {
+ parse(parser, LOG, info.getPath(), inputStream, handler, metadata, context);
+ }
+ }
+ };
+ }
+
+
@PUT
@Consumes("*/*")
@Produces("text/plain")
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index be5092a..7df879e 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -18,6 +18,7 @@
package org.apache.tika.server;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import javax.ws.rs.core.Response;
@@ -36,7 +37,6 @@ import org.junit.Test;
public class TikaResourceTest extends CXFTestBase {
public static final String TEST_DOC = "test.doc";
- public static final String TEST_XLSX = "16637.xlsx";
public static final String TEST_PASSWORD_PROTECTED = "password.xls";
private static final String TEST_RECURSIVE_DOC =
"test_recursive_embedded.docx";
@@ -77,6 +77,35 @@ public class TikaResourceTest extends CXFTestBase {
}
@Test
+ public void testTextMain() throws Exception {
+ //boilerpipe
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/main")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testHTML.html"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("Title : Test Indexation Html"));
+ assertFalse(responseMsg.contains("Indexation du fichier"));
+ }
+
+ @Test
+ public void testTextMainMultipart() throws Exception {
+ //boilerpipe
+ Attachment attachmentPart =
+ new Attachment("myhtml", "text/html", ClassLoader.getSystemResourceAsStream("testHTML.html"));
+
+
+ Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main")
+ .type("multipart/form-data")
+ .accept("text/plain")
+ .put(attachmentPart);
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("Title : Test Indexation Html"));
+ assertFalse(responseMsg.contains("Indexation du fichier"));
+ }
+
+ @Test
public void testApplicationWadl() throws Exception {
Response response = WebClient
.create(endPoint + TIKA_PATH + "?_wadl")
diff --git a/tika-server/src/test/resources/testHTML.html b/tika-server/src/test/resources/testHTML.html
new file mode 100644
index 0000000..5bbd4d8
--- /dev/null
+++ b/tika-server/src/test/resources/testHTML.html
@@ -0,0 +1,28 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>
+ <title>Title : Test Indexation Html</title>
+ <meta name="Author" content="Tika Developers">
+ <meta name="ICBM" content="51.2312, -5.1987">
+ <meta http-equiv="refresh" content="5">
+ </head>
+ <body>
+ <h1><a name="test-anchor"></a>Test Indexation Html</h1>
+ <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
+ </body>
+</html>
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].