fix for TIKA-1876 contributed by manalishah
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c809690e Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c809690e Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c809690e Branch: refs/heads/master Commit: c809690ec87ffa600018dbc5eee6d6756645adb0 Parents: ed762b7 Author: manali <[email protected]> Authored: Fri Feb 26 19:58:06 2016 -0800 Committer: manali <[email protected]> Committed: Fri Feb 26 19:58:06 2016 -0800 ---------------------------------------------------------------------- .gitignore | 2 +- .../tika/parser/ner/nltk/NLTKNERecogniser.java | 137 +++++++++++++++++++ .../tika/parser/ner/nltk/NLTKServer.properties | 16 +++ .../parser/ner/nltk/NLTKNERecogniserTest.java | 54 ++++++++ 4 files changed, 208 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index c262c68..8093709 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ target *.iws *.bin nbactions.xml -nb-configuration.xml \ No newline at end of file +nb-configuration.xml*.DS_Store http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java new file mode 100644 index 0000000..850f4dd --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ner.nltk; + +import org.apache.tika.parser.ner.NERecogniser; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.*; +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; + +import org.apache.cxf.jaxrs.client.WebClient; + +/** + * This class offers an implementation of {@link NERecogniser} based on + * ne_chunk() module of NLTK. This NER requires additional setup, + * due to Http requests to an endpoint server that runs NLTK. 
+ * This endpoint has been implemented as pip/setuptools installable python module + * See <a href="https://github.com/manalishah/NLTKRest"></a> + * See <a href="http://wiki.apache.org/tika/TikaAndNLTK"> + * + */ +public class NLTKNERecogniser implements NERecogniser { + + private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class); + private static boolean available = false; + private static final String NLTK_REST_HOST = "http://localhost:8881"; + /** + * some common entities identified by NLTK + */ + public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{ + add("NAMES"); + }}; + + String restHostUrlStr; + public NLTKNERecogniser(){ + try { + + String restHostUrlStr=""; + try { + restHostUrlStr = readRestUrl(); + } catch (IOException e) { + e.printStackTrace(); + } + + if (restHostUrlStr == null + || (restHostUrlStr != null && restHostUrlStr.equals(""))) { + this.restHostUrlStr = NLTK_REST_HOST; + } else { + this.restHostUrlStr = restHostUrlStr; + } + //check if nltkrest is running + Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get(); + int responseCode = response.getStatus(); + if(responseCode == 200){ + available = true; + } + else{ + LOG.info("NLTKRest Server is not running"); + } + + } catch (Exception e) { + LOG.debug(e.getMessage(), e); + } + } + + private static String readRestUrl() throws IOException { + Properties nltkProperties = new Properties(); + nltkProperties.load(NLTKNERecogniser.class + .getResourceAsStream("NLTKServer.properties")); + + return nltkProperties.getProperty("nltk.server.url"); + } + + /** + * @return {@code true} if server endpoint is available. + * returns {@code false} if server endpoint is not avaliable for service. 
+ */ + public boolean isAvailable() { + return available; + } + + /** + * Gets set of entity types recognised by this recogniser + * @return set of entity classes/types + */ + public Set<String> getEntityTypes() { + return ENTITY_TYPES; + } + + /** + * recognises names of entities in the text + * @param text text which possibly contains names + * @return map of entity type -> set of names + */ + public Map<String, Set<String>> recognise(String text) { + Map<String, Set<String>> entities = new HashMap<>(); + try { + int port = 8881; + String url = restHostUrlStr + "/nltk"; + Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text); + int responseCode = response.getStatus(); + if (responseCode == 200) { + String result = response.readEntity(String.class); + JSONParser parser = new JSONParser(); + JSONObject j = (JSONObject) parser.parse(result); + Set s = entities.put("NAMES", new HashSet((Collection) j.get("names"))); + } + } + catch (Exception e) { + LOG.debug(e.getMessage(), e); + } + ENTITY_TYPES.clear(); + ENTITY_TYPES.addAll(entities.keySet()); + return entities; + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties new file mode 100644 index 0000000..5909b69 --- /dev/null +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +nltk.server.url=http://localhost:8881 http://git-wip-us.apache.org/repos/asf/tika/blob/c809690e/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java new file mode 100644 index 0000000..5c1307f --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.ner.nltk; + +import org.apache.commons.logging.Log; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ner.NamedEntityParser; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertTrue; + +public class NLTKNERecogniserTest { + @Test + public void testGetEntityTypes() throws Exception { + String text = "America is a big country."; + System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName()); + Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); + Metadata md = new Metadata(); + tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); + + Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES"))); + if(names.size() == 0) { + return; + } + else { + assertTrue(names.contains("America")); + assertTrue(names.size() == 1); //and nothing else + } + } +}
