Repository: tika Updated Branches: refs/heads/master 7c245fa87 -> 9056894da
Added NLTK NER Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d685742c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d685742c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d685742c Branch: refs/heads/master Commit: d685742c6e81b9153ce881f9622292104a4144d2 Parents: 6a09233 Author: manali <[email protected]> Authored: Tue Feb 2 00:12:28 2016 -0800 Committer: manali <[email protected]> Committed: Tue Feb 2 00:12:28 2016 -0800 ---------------------------------------------------------------------- .gitignore | 3 +- tika-parsers/pom.xml | 7 + .../tika/parser/ner/nltk/NLTKNERecogniser.java | 161 +++++++++++++++++++ .../parser/ner/nltk/NLTKNERecogniserTest.java | 40 +++++ 4 files changed, 210 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index c262c68..40c895f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ target *.iws *.bin nbactions.xml -nb-configuration.xml \ No newline at end of file +nb-configuration.xml +*.DS_Store http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 57497ec..8d330c3 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -88,6 +88,13 @@ <version>2.1.1</version> </dependency> + + <!-- manali added this--> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient</artifactId> + <version>4.5.1</version> + </dependency> <!-- Optional OSGi dependencies, used only when running within OSGi --> <dependency> <groupId>org.apache.felix</groupId> 
http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java new file mode 100644 index 0000000..eb216ea --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.ner.nltk; + +import org.apache.http.client.methods.HttpGet; +import org.apache.tika.parser.ner.NERecogniser; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; +import org.apache.http.HttpResponse; +import org.apache.http.NameValuePair; +import org.apache.http.client.HttpClient; +import org.apache.http.client.entity.UrlEncodedFormEntity; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.message.BasicNameValuePair; + + +/** + * This class offers an implementation of {@link NERecogniser} based on + * an NLTK REST server that is queried over HTTP. This NER requires additional setup, + * because it binds at runtime to a server listening on http://localhost:5000/. + * See <a href="http://wiki.apache.org/tika/TikaAndNER#NLTK"> + * Tika NER Wiki</a> for configuring this recogniser. 
+ * @see NERecogniser + * + */ +public class NLTKNERecogniser implements NERecogniser { + + private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class); + private final static String USER_AGENT = "Mozilla/5.0"; + private static boolean available = false; + public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{ + add(PERSON); + add(TIME); + add(LOCATION); + add(ORGANIZATION); + add(MONEY); + add(PERCENT); + add(DATE); + add(FACILITY); + add(GPE); + }}; + + public NLTKNERecogniser(){ + try { + + String url = "http://localhost:5000/"; + HttpClient client = HttpClientBuilder.create().build(); + HttpGet get = new HttpGet(url); + + // add header + get.setHeader("User-Agent", USER_AGENT); + HttpResponse response = client.execute(get); + int responseCode = response.getStatusLine().getStatusCode(); + if(responseCode == 200){ + available = true; + } + else{ + LOG.info("NLTKRest Server is not running"); + } + + } catch (Exception e) { + LOG.debug(e.getMessage(), e); + } + } + + + /** + * + * @return {@code true} if model was available, valid and was able to initialise the classifier. + * returns {@code false} when this recogniser is not available for service. 
+ */ + public boolean isAvailable() { + return available; + } + + /** + * Gets set of entity types recognised by this recogniser + * @return set of entity classes/types + */ + public Set<String> getEntityTypes() { + return ENTITY_TYPES; + } + + /** + * recognises names of entities in the text + * @param text text which possibly contains names + * @return map of entity type -> set of names + */ + public Map<String, Set<String>> recognise(String text) { + Map<String, Set<String>> entities = new HashMap<>(); + try { + String url = "http://localhost:5000/nltk"; + HttpClient client = HttpClientBuilder.create().build(); + HttpPost post = new HttpPost(url); + // add header + post.setHeader("User-Agent", USER_AGENT); + List<NameValuePair> urlParameters = new ArrayList<NameValuePair>(); + urlParameters.add(new BasicNameValuePair("text", text)); + post.setEntity(new UrlEncodedFormEntity(urlParameters)); + + HttpResponse response = client.execute(post); + + int responseCode = response.getStatusLine().getStatusCode(); + if (responseCode == 200) { + BufferedReader rd = new BufferedReader( + new InputStreamReader(response.getEntity().getContent())); + + String result = rd.readLine(); + + JSONParser parser = new JSONParser(); + JSONObject j = (JSONObject) parser.parse(result); + JSONArray aa = new JSONArray(); + for (Object x : j.keySet()) { + aa = (JSONArray) j.get(x.toString()); + Set s = new HashSet(); + for (Object y : aa) { + s.add(y.toString()); + } + entities.put(x.toString(), s); + } + } + } + catch (Exception e) { + LOG.debug(e.getMessage(), e); + } + ENTITY_TYPES.clear(); + ENTITY_TYPES.addAll(entities.keySet()); + LOG.info("returning this:" + entities.keySet().toString()); + return entities; + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/d685742c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java ---------------------------------------------------------------------- diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java new file mode 100644 index 0000000..4fbeb42 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java @@ -0,0 +1,40 @@ +package org.apache.tika.parser.ner.nltk; + +/** + * Created by manali on 2/1/16. + */ +import org.apache.commons.logging.Log; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ner.NamedEntityParser; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertTrue; + +public class NLTKNERecogniserTest { + @Test + public void testGetEntityTypes() throws Exception { + + String text = "America"; + System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, NLTKNERecogniser.class.getName()); + + Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml"))); + Metadata md = new Metadata(); + tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); + + + Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE"))); + if(gpe.size() == 0) return; + else { + assertTrue(gpe.contains("America")); + assertTrue(gpe.size() == 1); //and nothing else + } + } +}
