This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  ea19d62   TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server
       new  3cc4bc2   Merge remote-tracking branch 'origin/master'
ea19d62 is described below

commit ea19d622bb76d2dab7561d6f9cffd7667b92e75b
Author: tballison <talli...@mitre.org>
AuthorDate: Wed May 3 21:23:39 2017 -0400

    TIKA-2343 -- add boilerpipe option (tika-app's "text-main") to tika-server
---
 CHANGES.txt                                        |  3 ++
 .../apache/tika/server/resource/TikaResource.java  | 43 ++++++++++++++++++++++
 .../org/apache/tika/server/TikaResourceTest.java   | 31 +++++++++++++++-
 tika-server/src/test/resources/testHTML.html       | 28 ++++++++++++++
 4 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8348be7..de5606d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -34,6 +34,9 @@ Release 1.15 - ??
 
   * Bug fix for WordPerfect via Pascal Essiembre (TIKA-2352).
 
+  * Added "text-main" equivalent option to tika-server via
+    /tika/main (TIKA-2343).
+
   * Enabled configuration of the EncodingDetector used by
     parsers that extend AbstractEncodingDetectorParser (TIKA-2273).
 
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 288d11d..55f0160 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -67,6 +67,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.parser.html.HtmlParser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
@@ -339,6 +340,48 @@ public class TikaResource {
         return produceText(att.getObject(InputStream.class), att.getHeaders(), 
info);
     }
 
+    //this is equivalent to text-main in tika-app
+    @PUT
+    @Consumes("*/*")
+    @Produces("text/plain")
+    @Path("main")
+    public StreamingOutput getTextMain(final InputStream is, @Context 
HttpHeaders httpHeaders, @Context final UriInfo info) {
+        return produceTextMain(is, httpHeaders.getRequestHeaders(), info);
+    }
+
+    //this is equivalent to text-main (Boilerpipe handler) in tika-app
+    @PUT
+    @Consumes("multipart/form-data")
+    @Produces("text/plain")
+    @Path("form/main")
+    public StreamingOutput getTextMainFromMultipart(final Attachment att, 
@Context final UriInfo info) {
+        return produceTextMain(att.getObject(InputStream.class), 
att.getHeaders(), info);
+    }
+
+    public StreamingOutput produceTextMain(final InputStream is, @Context 
MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) {
+        final Parser parser = createParser();
+        final Metadata metadata = new Metadata();
+        final ParseContext context = new ParseContext();
+
+        fillMetadata(parser, metadata, context, httpHeaders);
+        fillParseContext(context, httpHeaders, parser);
+
+        logRequest(LOG, info, metadata);
+
+        return new StreamingOutput() {
+            public void write(OutputStream outputStream) throws IOException, 
WebApplicationException {
+                Writer writer = new OutputStreamWriter(outputStream, UTF_8);
+
+                ContentHandler handler = new BoilerpipeContentHandler(writer);
+
+                try (InputStream inputStream = is) {
+                    parse(parser, LOG, info.getPath(), inputStream, handler, 
metadata, context);
+                }
+            }
+        };
+    }
+
+
     @PUT
     @Consumes("*/*")
     @Produces("text/plain")
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index be5092a..7df879e 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -18,6 +18,7 @@
 package org.apache.tika.server;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import javax.ws.rs.core.Response;
@@ -36,7 +37,6 @@ import org.junit.Test;
 
 public class TikaResourceTest extends CXFTestBase {
     public static final String TEST_DOC = "test.doc";
-    public static final String TEST_XLSX = "16637.xlsx";
     public static final String TEST_PASSWORD_PROTECTED = "password.xls";
     private static final String TEST_RECURSIVE_DOC = 
"test_recursive_embedded.docx";
 
@@ -77,6 +77,35 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
+    public void testTextMain() throws Exception {
+        //boilerpipe
+        Response response = WebClient.create(endPoint + TIKA_PATH + "/main")
+                .accept("text/plain")
+                .put(ClassLoader.getSystemResourceAsStream("testHTML.html"));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertTrue(responseMsg.contains("Title : Test Indexation Html"));
+        assertFalse(responseMsg.contains("Indexation du fichier"));
+    }
+
+    @Test
+    public void testTextMainMultipart() throws Exception {
+        //boilerpipe
+        Attachment attachmentPart =
+                new Attachment("myhtml", "text/html", 
ClassLoader.getSystemResourceAsStream("testHTML.html"));
+
+
+        Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main")
+                .type("multipart/form-data")
+                .accept("text/plain")
+                .put(attachmentPart);
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertTrue(responseMsg.contains("Title : Test Indexation Html"));
+        assertFalse(responseMsg.contains("Indexation du fichier"));
+    }
+
+    @Test
     public void testApplicationWadl() throws Exception {
         Response response = WebClient
                 .create(endPoint + TIKA_PATH + "?_wadl")
diff --git a/tika-server/src/test/resources/testHTML.html b/tika-server/src/test/resources/testHTML.html
new file mode 100644
index 0000000..5bbd4d8
--- /dev/null
+++ b/tika-server/src/test/resources/testHTML.html
@@ -0,0 +1,28 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+       <head>
+        <title>Title : Test Indexation Html</title>
+        <meta name="Author" content="Tika Developers">
+        <meta name="ICBM" content="51.2312, -5.1987">
+        <meta http-equiv="refresh" content="5">
+    </head>
+       <body>
+               <h1><a name="test-anchor"></a>Test Indexation Html</h1>
+               <p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
+       </body>
+</html>

-- 
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.

Reply via email to