created NLTK host server properties
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ac4c0b2c Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ac4c0b2c Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ac4c0b2c Branch: refs/heads/master Commit: ac4c0b2c9321bba395b92214a3504d8346c3e936 Parents: f054bcd Author: manali <[email protected]> Authored: Wed Feb 24 22:23:26 2016 -0800 Committer: manali <[email protected]> Committed: Wed Feb 24 22:23:26 2016 -0800 ---------------------------------------------------------------------- tika-parsers/pom.xml | 15 ++-- .../tika/parser/ner/nltk/NLTKNERecogniser.java | 72 ++++++++++---------- .../tika/parser/ner/nltk/NLTKServer.properties | 16 +++++ .../parser/ner/nltk/NLTKNERecogniserTest.java | 8 +-- 4 files changed, 63 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 41daf4d..088a6e9 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -88,13 +88,6 @@ <version>2.1.1</version> </dependency> - - <!-- manali added this--> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient</artifactId> - <version>4.5.1</version> - </dependency> <!-- Optional OSGi dependencies, used only when running within OSGi --> <dependency> <groupId>org.apache.felix</groupId> @@ -366,6 +359,14 @@ <version>3.2.2</version> <scope>provided</scope> </dependency> + + <!--Jackson parse String to JSON--> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>2.7.1</version> + </dependency> + </dependencies> <build> http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java index 99cde6f..eddddcb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniser.java @@ -16,38 +16,18 @@ */ package org.apache.tika.parser.ner.nltk; -import org.apache.http.client.methods.HttpGet; import org.apache.tika.parser.ner.NERecogniser; -import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.util.ArrayList; -import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.UrlEncodedFormEntity; -import org.apache.http.client.methods.HttpPost; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.message.BasicNameValuePair; - -import javax.ws.rs.core.Form; +import java.io.*; +import java.util.*; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; import org.apache.cxf.jaxrs.client.WebClient; -import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition; -import org.apache.cxf.jaxrs.ext.multipart.MultipartBody; /** * This class offers an implementation of {@link NERecogniser} based on @@ -59,9 +39,8 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody; public class NLTKNERecogniser implements NERecogniser { private static final Logger LOG = LoggerFactory.getLogger(NLTKNERecogniser.class); - private final static String USER_AGENT = "Mozilla/5.0"; private static boolean available = false; - + private static final String NLTK_REST_HOST = "http://localhost:8881"; /** * some common entities identified by NLTK */ @@ -75,12 +54,31 @@ public class NLTKNERecogniser implements NERecogniser { add(DATE); add(FACILITY); add(GPE); + add("NAMES"); }}; + String restHostUrlStr; public NLTKNERecogniser(){ try { - String url = "http://localhost:5000/"; - Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).get(); + + String restHostUrlStr=""; + try { + restHostUrlStr = readRestUrl(); + } catch (IOException e) { + e.printStackTrace(); + } + + if (restHostUrlStr == null + || (restHostUrlStr != null && restHostUrlStr.equals(""))) { + this.restHostUrlStr = NLTK_REST_HOST; + } else { + this.restHostUrlStr = restHostUrlStr; + } + + + + + Response response = WebClient.create(restHostUrlStr).accept(MediaType.TEXT_HTML).get(); int responseCode = response.getStatus(); if(responseCode == 200){ available = true; @@ -94,6 +92,13 @@ public class NLTKNERecogniser implements NERecogniser { } } + private static String readRestUrl() throws IOException { + Properties nltkProperties = new Properties(); + nltkProperties.load(NLTKNERecogniser.class + .getResourceAsStream("NLTKServer.properties")); + + return nltkProperties.getProperty("nltk.server.url"); + } /** * @return {@code true} if server endpoint is available. @@ -119,22 +124,15 @@ public class NLTKNERecogniser implements NERecogniser { public Map<String, Set<String>> recognise(String text) { Map<String, Set<String>> entities = new HashMap<>(); try { - String url = "http://localhost:5000/nltk"; - Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).form(new Form().param("text",text)); + int port = 8881; + String url = restHostUrlStr + "/nltk"; + Response response = WebClient.create(url).accept(MediaType.TEXT_HTML).post(text); int responseCode = response.getStatus(); if (responseCode == 200) { String result = response.readEntity(String.class); JSONParser parser = new JSONParser(); JSONObject j = (JSONObject) parser.parse(result); - JSONArray aa = new JSONArray(); - for (Object x : j.keySet()) { - aa = (JSONArray) j.get(x.toString()); - Set s = new HashSet(); - for (Object y : aa) { - s.add(y.toString()); - } - entities.put(x.toString(), s); - } + Set s = entities.put("NAMES", new HashSet((Collection) j.get("names"))); } } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties new file mode 100644 index 0000000..24f5a2e --- /dev/null +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +nltk.server.url=http://localhost:5000 http://git-wip-us.apache.org/repos/asf/tika/blob/ac4c0b2c/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java index 2861051..a40ec24 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/ner/nltk/NLTKNERecogniserTest.java @@ -42,13 +42,13 @@ public class NLTKNERecogniserTest { Metadata md = new Metadata(); tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md); - Set<String> gpe = new HashSet<>(Arrays.asList(md.getValues("NER_GPE"))); - if(gpe.size() == 0) { + Set<String> names = new HashSet<>(Arrays.asList(md.getValues("NER_NAMES"))); + if(names.size() == 0) { return; } else { - assertTrue(gpe.contains("America")); - assertTrue(gpe.size() == 1); //and nothing else + assertTrue(names.contains("America")); + assertTrue(names.size() == 1); //and nothing else } } }
