updated with changes
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0dbd69ce Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0dbd69ce Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0dbd69ce Branch: refs/heads/master Commit: 0dbd69cef5ec603a11d7f5b52f119e3bea1550b5 Parents: 7ebe007 Author: manali <[email protected]> Authored: Tue Mar 1 00:59:11 2016 -0800 Committer: manali <[email protected]> Committed: Tue Mar 1 00:59:11 2016 -0800 ---------------------------------------------------------------------- .../tika/sax/RichTextContentHandlerTest.java | 75 ++++++++++++++++++++ .../tika/parser/ner/nltk/NLTKNERecogniser.java | 16 +++-- .../parser/ner/nltk/NLTKNERecogniserTest.java | 2 +- 3 files changed, 86 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java new file mode 100644 index 0000000..257ea38 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.sax; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayOutputStream; +import java.io.OutputStreamWriter; +import java.nio.charset.Charset; + +import org.apache.tika.metadata.Metadata; +import org.junit.Test; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Test cases for the {@link RichTextContentHandler} class. + */ +public class RichTextContentHandlerTest { + + /** + * Test to check a tags are detected and rich text version used. + */ + @Test + public void aTagTest() throws Exception { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + + XHTMLContentHandler xhtml = new XHTMLContentHandler( + new RichTextContentHandler( + new OutputStreamWriter(buffer, Charset.defaultCharset())), + new Metadata()); + xhtml.startDocument(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "", "name", "", "value"); + xhtml.startElement("a", attributes); + xhtml.endDocument(); + + assertEquals("\n\n\n\n[bookmark: value]", buffer.toString(UTF_8.name())); + } + + /** + * Test to check img tags are detected and rich text version used. 
+ */ + @Test + public void imgTagTest() throws Exception { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + + XHTMLContentHandler xhtml = new XHTMLContentHandler( + new RichTextContentHandler( + new OutputStreamWriter(buffer, Charset.defaultCharset())), + new Metadata()); + xhtml.startDocument(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "", "alt", "", "value"); + xhtml.startElement("img", attributes); + xhtml.endDocument(); + + assertEquals("\n\n\n\n[image: value]", buffer.toString(UTF_8.name())); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java index 1edfe28..5407189 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java @@ -22,8 +22,13 @@ import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; -import java.util.*; +import java.io.IOException; +import java.util.Set; +import java.util.HashSet; +import java.util.Collection; +import java.util.Map; +import java.util.HashMap; +import java.util.Properties; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; @@ -41,6 +46,7 @@ public class NLTKNERecogniser implements NERecogniser { private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class); private static boolean available = false; private static final String NLTK_REST_HOST = "http://localhost:8881"; + private String restHostUrlStr; /** * some common entities identified by NLTK */ @@ -48,7 +54,7 @@ public class NLTKNERecogniser implements NERecogniser { 
add("NAMES"); }}; - String restHostUrlStr; + public NLTKNERecogniser(){ try { @@ -59,8 +65,7 @@ public class NLTKNERecogniser implements NERecogniser { e.printStackTrace(); } - if (restHostUrlStr == null - || (restHostUrlStr != null && restHostUrlStr.equals(""))) { + if (restHostUrlStr == null || restHostUrlStr.equals("")) { this.restHostUrlStr = NLTK_REST_HOST; } else { this.restHostUrlStr = restHostUrlStr; @@ -115,7 +120,6 @@ public class NLTKNERecogniser implements NERecogniser { public Map<String, Set<String>> recognise(String text) { Map<String, Set<String>> entities = new HashMap<>(); try { - int port = 8881; String url = restHostUrlStr + "/nltk"; Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text); int responseCode = response.getStatus(); http://git-wip-us.apache.org/repos/asf/tika/blob/0dbd69ce/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java index 5c1307f..94d9a27 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java @@ -48,7 +48,7 @@ public class NLTKNERecogniserTest { } else { assertTrue(names.contains("America")); - assertTrue(names.size() == 1); //and nothing else + assertTrue(names.size() == 1); } } }
