Repository: tika
Updated Branches:
  refs/heads/master e0ca3b5df -> a35320069


Grobid NER


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7e2c089c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7e2c089c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7e2c089c

Branch: refs/heads/master
Commit: 7e2c089c8e3f01fd60473517f64ed20887584527
Parents: d184e9b
Author: AravindRam <[email protected]>
Authored: Mon Apr 11 00:48:22 2016 -0700
Committer: AravindRam <[email protected]>
Committed: Mon Apr 11 00:48:22 2016 -0700

----------------------------------------------------------------------
 .../parser/ner/grobid/GrobidNERecogniser.java   | 223 +++++++++++++++++++
 .../parser/ner/grobid/GrobidServer.properties   |  17 ++
 .../ner/grobid/GrobidNERecogniserTest.java      |  66 ++++++
 3 files changed, 306 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
new file mode 100644
index 0000000..e4d7152
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniser.java
@@ -0,0 +1,223 @@
+package org.apache.tika.parser.ner.grobid;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URLEncoder;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * {@link NERecogniser} that extracts measurements from text by calling a Grobid REST server.
+ *
+ * <p>The server URL and the text endpoint are read once, at construction time, from the
+ * {@code GrobidServer.properties} classpath resource (keys {@code grobid.server.url} and
+ * {@code grobid.endpoint.text}). When the resource or a key is missing, the built-in
+ * defaults are used instead.</p>
+ */
+public class GrobidNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(GrobidNERecogniser.class);
+
+    /** Classpath resource (resolved relative to this class) holding the server settings. */
+    private static final String PROPERTIES_FILE = "GrobidServer.properties";
+
+    /** Server URL used when {@code grobid.server.url} is not configured. */
+    private static final String GROBID_REST_HOST = "http://localhost:8080";
+
+    /** Endpoint used when {@code grobid.endpoint.text} is not configured. */
+    private static final String GROBID_TEXT_ENDPOINT = "/processQuantityText";
+
+    private static final String MEASUREMENT_NUMBERS = "MEASUREMENT_NUMBERS";
+    private static final String MEASUREMENT_UNITS = "MEASUREMENT_UNITS";
+    private static final String MEASUREMENTS = "MEASUREMENTS";
+    private static final String NORMALIZED_MEASUREMENTS = "NORMALIZED_MEASUREMENTS";
+
+    /**
+     * Useful entities from Grobid NER. The set is read-only: it is shared by every
+     * instance, so it must not change as a side effect of a single request.
+     */
+    public static final Set<String> ENTITY_TYPES;
+
+    static {
+        Set<String> types = new HashSet<String>();
+        types.add(MEASUREMENT_NUMBERS);
+        types.add(MEASUREMENT_UNITS);
+        types.add(MEASUREMENTS);
+        types.add(NORMALIZED_MEASUREMENTS);
+        ENTITY_TYPES = Collections.unmodifiableSet(types);
+    }
+
+    private final String restHostUrlStr;
+    private final String restEndpoint;
+
+    /** Result of the probe made by this instance's constructor. */
+    private final boolean available;
+
+    /**
+     * Resolves the server settings and probes the server once with a GET request.
+     * Never throws: any failure is logged and leaves the recogniser unavailable.
+     */
+    public GrobidNERecogniser() {
+        Properties settings = new Properties();
+        try {
+            settings = loadProperties();
+        } catch (IOException | IllegalArgumentException e) {
+            LOG.warn("Unable to read " + PROPERTIES_FILE + "; using default Grobid settings", e);
+        }
+        this.restHostUrlStr =
+                valueOrDefault(settings.getProperty("grobid.server.url"), GROBID_REST_HOST);
+        this.restEndpoint =
+                valueOrDefault(settings.getProperty("grobid.endpoint.text"), GROBID_TEXT_ENDPOINT);
+        // Probe the URL that will actually be used, i.e. after the default has been applied.
+        this.available = probeServer(this.restHostUrlStr);
+    }
+
+    /**
+     * Loads the Grobid settings from the properties file.
+     *
+     * @return the loaded properties
+     * @throws IOException if the resource is missing or cannot be read
+     */
+    private static Properties loadProperties() throws IOException {
+        InputStream in = GrobidNERecogniser.class.getResourceAsStream(PROPERTIES_FILE);
+        if (in == null) {
+            throw new IOException("Resource not found: " + PROPERTIES_FILE);
+        }
+        try {
+            Properties properties = new Properties();
+            properties.load(in);
+            return properties;
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * @return the trimmed {@code value}, or {@code fallback} when {@code value} is
+     * {@code null} or blank
+     */
+    private static String valueOrDefault(String value, String fallback) {
+        if (value == null || value.trim().isEmpty()) {
+            return fallback;
+        }
+        return value.trim();
+    }
+
+    /**
+     * Issues a GET request against the server root.
+     *
+     * @return {@code true} only if the server answered with HTTP status 200
+     */
+    private static boolean probeServer(String hostUrl) {
+        Response response = null;
+        try {
+            response = WebClient.create(hostUrl).accept(MediaType.APPLICATION_JSON).get();
+            if (response.getStatus() == 200) {
+                return true;
+            }
+            LOG.info("Grobid REST Server is not running");
+        } catch (Exception e) {
+            // Boundary catch: a missing optional server must not break construction.
+            LOG.info("Grobid REST Server is not reachable at " + hostUrl, e);
+        } finally {
+            closeQuietly(response);
+        }
+        return false;
+    }
+
+    /** Closes the response, logging (not propagating) any failure. */
+    private static void closeQuietly(Response response) {
+        if (response == null) {
+            return;
+        }
+        try {
+            response.close();
+        } catch (Exception e) {
+            LOG.debug("Failed to close Grobid response", e);
+        }
+    }
+
+    /**
+     * @return {@code true} if the server endpoint was available when this instance was
+     * created, {@code false} otherwise
+     */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets the set of entity types recognised by this recogniser.
+     *
+     * @return read-only set of entity classes/types
+     */
+    @Override
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * Reads the JSON array stored under {@code key}.
+     *
+     * @param obj JSON object to read from, may be {@code null}
+     * @param key name of the member expected to hold an array
+     * @return the array, or an empty array when {@code obj} is {@code null}, the key is
+     * missing or the value is not an array; never {@code null}
+     */
+    public JSONArray convertToJSONArray(JSONObject obj, String key) {
+        Object value = (obj == null) ? null : obj.get(key);
+        if (value instanceof JSONArray) {
+            return (JSONArray) value;
+        }
+        return new JSONArray();
+    }
+
+    /**
+     * Parses a JSON string into a JSON object.
+     *
+     * @param jsonString text to parse, may be {@code null}
+     * @return the parsed object, or an empty object when the text is {@code null}, is not
+     * valid JSON or does not describe a JSON object; never {@code null}
+     */
+    public JSONObject convertToJSONObject(String jsonString) {
+        if (jsonString != null) {
+            try {
+                Object parsed = new JSONParser().parse(jsonString);
+                if (parsed instanceof JSONObject) {
+                    return (JSONObject) parsed;
+                }
+            } catch (Exception e) {
+                LOG.info("Unable to parse Grobid response as JSON", e);
+            }
+        }
+        return new JSONObject();
+    }
+
+    /**
+     * Recognises measurements in the text.
+     *
+     * @param text text which possibly contains measurements
+     * @return map of entity type to the set of names found; empty when the text is
+     * {@code null}, the request fails or the server does not answer with HTTP status 200
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> entities = new HashMap<String, Set<String>>();
+        if (text == null) {
+            return entities;
+        }
+
+        Set<String> measurementNumberSet = new HashSet<String>();
+        Set<String> unitSet = new HashSet<String>();
+        Set<String> measurementSet = new HashSet<String>();
+        Set<String> normalizedMeasurementSet = new HashSet<String>();
+
+        Response response = null;
+        try {
+            String url = restHostUrlStr + restEndpoint + "?text=" + URLEncoder.encode(text, "UTF-8");
+            response = WebClient.create(url).accept(MediaType.APPLICATION_JSON).get();
+            int responseCode = response.getStatus();
+            if (responseCode != 200) {
+                LOG.info("Grobid REST Server returned HTTP status " + responseCode);
+                return entities;
+            }
+
+            JSONObject root = convertToJSONObject(response.readEntity(String.class));
+            for (Object item : convertToJSONArray(root, "measurements")) {
+                if (!(item instanceof JSONObject)) {
+                    continue;
+                }
+                Object quantityValue = ((JSONObject) item).get("quantity");
+                if (!(quantityValue instanceof JSONObject)) {
+                    // Skip this entry only; the remaining measurements are still usable.
+                    continue;
+                }
+                JSONObject quantity = (JSONObject) quantityValue;
+
+                StringBuilder measurement = new StringBuilder();
+                StringBuilder normalizedMeasurement = new StringBuilder();
+
+                Object rawValue = quantity.get("rawValue");
+                if (rawValue != null) {
+                    String measurementNumber = rawValue.toString();
+                    measurement.append(measurementNumber).append(" ");
+                    measurementNumberSet.add(measurementNumber);
+                }
+
+                // The parser yields Long for integers and Double for decimals, so do not
+                // cast to a specific numeric type.
+                Object normalizedQuantity = quantity.get("normalizedQuantity");
+                if (normalizedQuantity != null) {
+                    normalizedMeasurement.append(normalizedQuantity.toString()).append(" ");
+                }
+
+                String unitName = unitName(quantity, "rawUnit");
+                if (unitName != null) {
+                    unitSet.add(unitName);
+                    measurement.append(unitName);
+                }
+
+                String normalizedUnitName = unitName(quantity, "normalizedUnit");
+                if (normalizedUnitName != null) {
+                    normalizedMeasurement.append(normalizedUnitName);
+                }
+
+                if (measurement.length() > 0) {
+                    measurementSet.add(measurement.toString());
+                }
+                if (normalizedMeasurement.length() > 0) {
+                    normalizedMeasurementSet.add(normalizedMeasurement.toString());
+                }
+            }
+
+            entities.put(MEASUREMENT_NUMBERS, measurementNumberSet);
+            entities.put(MEASUREMENT_UNITS, unitSet);
+            entities.put(MEASUREMENTS, measurementSet);
+            entities.put(NORMALIZED_MEASUREMENTS, normalizedMeasurementSet);
+        } catch (Exception e) {
+            // Boundary catch: recognition is best-effort and must not fail the caller.
+            LOG.warn("Grobid measurement recognition failed", e);
+        } finally {
+            closeQuietly(response);
+        }
+        return entities;
+    }
+
+    /**
+     * @return the {@code name} member of the unit object stored under {@code key}, or
+     * {@code null} when the unit or its name is absent
+     */
+    private static String unitName(JSONObject quantity, String key) {
+        Object unit = quantity.get(key);
+        if (!(unit instanceof JSONObject)) {
+            return null;
+        }
+        Object name = ((JSONObject) unit).get("name");
+        return (name == null) ? null : name.toString();
+    }
+}
+       
+
+       
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
new file mode 100644
index 0000000..a7718ab
--- /dev/null
+++ 
b/tika-parsers/src/main/resources/org/apache/tika/parser/ner/grobid/GrobidServer.properties
@@ -0,0 +1,17 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+grobid.server.url=http://localhost:8080
+grobid.endpoint.text=/processQuantityText
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/7e2c089c/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java
new file mode 100644
index 0000000..60279e9
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/ner/grobid/GrobidNERecogniserTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.grobid;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+/**
+ * Test case for {@link GrobidNERecogniser}.
+ *
+ * <p>The test needs a running Grobid REST server, so it is skipped (rather than failed)
+ * when the recogniser reports that the server is not available.</p>
+ */
+public class GrobidNERecogniserTest {
+
+    /**
+     * Parses a short sentence through the named entity parser configured with
+     * {@link GrobidNERecogniser} and checks the measurement metadata it produces.
+     */
+    @Test
+    public void testGetEntityTypes() throws Exception {
+        // Without a reachable server no entities are produced and every assertion would fail.
+        org.junit.Assume.assumeTrue(new GrobidNERecogniser().isAvailable());
+
+        String text = "I've lost one minute.";
+        String previousImpl = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, GrobidNERecogniser.class.getName());
+        try {
+            Tika tika = new Tika(
+                    new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+            assertTrue(valuesOf(md, "NER_MEASUREMENT_NUMBERS").contains("one"));
+            assertTrue(valuesOf(md, "NER_MEASUREMENT_UNITS").contains("minute"));
+            assertTrue(valuesOf(md, "NER_MEASUREMENTS").contains("one minute"));
+            assertTrue(valuesOf(md, "NER_NORMALIZED_MEASUREMENTS").contains("60 s"));
+        } finally {
+            // Do not leak the recogniser selection into tests that run afterwards.
+            if (previousImpl == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previousImpl);
+            }
+        }
+    }
+
+    /** Collects all metadata values stored under {@code key} into a set. */
+    private static HashSet<String> valuesOf(Metadata md, String key) {
+        return new HashSet<String>(Arrays.asList(md.getValues(key)));
+    }
+}
+
+
+
+

Reply via email to