Repository: tika
Updated Branches:
  refs/heads/master e2fdcaa7e -> f82702632


mitie ner parser added


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5f859fbb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5f859fbb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5f859fbb

Branch: refs/heads/master
Commit: 5f859fbb7684a907db14959a4e49dfa644ba93d6
Parents: d184e9b
Author: manali <[email protected]>
Authored: Mon Apr 11 22:05:30 2016 -0700
Committer: manali <[email protected]>
Committed: Mon Apr 11 22:05:30 2016 -0700

----------------------------------------------------------------------
 .../parser/ner/mitie/MITIENERecogniser.java     | 166 +++++++++++++++++++
 1 file changed, 166 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/5f859fbb/tika-parsers/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java
new file mode 100644
index 0000000..8f39ded
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ner/mitie/MITIENERecogniser.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.mitie;
+
+import org.apache.commons.logging.Log;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.*;
+
+/**
+ *  This class offers an implementation of {@link NERecogniser} backed by the
+ *  MITIE named entity extractor ({@code edu.mit.ll.mitie.NamedEntityExtractor}).
+ *  The MITIE classes are not a compile-time dependency: they are looked up
+ *  reflectively at runtime, so this recogniser reports itself as unavailable
+ *  (see {@link #isAvailable()}) when they, or the model file, cannot be loaded.
+ *  The model path is taken from the system property {@value #MODEL_PROP_NAME}
+ *  unless it is passed explicitly to the constructor.
+ *  See <a href="http://wiki.apache.org/tika/TikaAndNER">Tika NER Wiki</a>
+ *  for configuring recognisers.
+ *  @see NERecogniser
+ *
+ */
+public class MITIENERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(MITIENERecogniser.class);
+
+    /** Name of the system property that holds the path to the MITIE NER model file. */
+    public static final String MODEL_PROP_NAME = "ner.mitie.model";
+
+    /** Entity types reported by {@link #getEntityTypes()}. */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(
+            Arrays.asList(PERSON, LOCATION, ORGANIZATION, "MISC"));
+
+    private static final String NAMED_ENTITY_EXTRACTOR_CLASS = "edu.mit.ll.mitie.NamedEntityExtractor";
+    private static final String STRING_VECTOR_CLASS = "edu.mit.ll.mitie.StringVector";
+    private static final String ENTITY_MENTION_VECTOR_CLASS = "edu.mit.ll.mitie.EntityMentionVector";
+    private static final String ENTITY_MENTION_CLASS = "edu.mit.ll.mitie.EntityMention";
+    private static final String GLOBAL_CLASS = "edu.mit.ll.mitie.global";
+
+    private boolean available = false;
+    private Object extractorInstance;
+
+    /**
+     * Creates a NERecogniser using the model path given by the
+     * {@value #MODEL_PROP_NAME} system property.
+     */
+    public MITIENERecogniser(){
+        this(System.getProperty(MODEL_PROP_NAME));
+    }
+
+    /**
+     * Creates a NERecogniser by loading model from given path.
+     * Failures are logged and leave the recogniser unavailable; they are not rethrown.
+     * @param modelPath path to NER model file
+     */
+    public MITIENERecogniser(String modelPath) {
+        LOG.info("model: {}", modelPath);
+        if (modelPath == null || modelPath.trim().isEmpty()) {
+            LOG.warn("No MITIE model path given; set the '{}' system property", MODEL_PROP_NAME);
+        } else {
+            try {
+                Class<?> namedEntityExtractorClass = Class.forName(NAMED_ENTITY_EXTRACTOR_CLASS);
+                // Load the model the caller asked for (was a hard-coded developer path).
+                extractorInstance = namedEntityExtractorClass
+                        .getDeclaredConstructor(String.class)
+                        .newInstance(modelPath);
+                this.available = true;
+            } catch (Exception | LinkageError e) {
+                // LinkageError: the reflectively loaded class (or something it needs)
+                // could not be linked; treat that as "not available" as well.
+                LOG.warn("{} while trying to load the model from {}", e.getMessage(), modelPath);
+                LOG.debug(e.getMessage(), e);
+            }
+        }
+        LOG.info("Available for service ? {}", available);
+    }
+
+    /**
+     *
+     * @return {@code true} if model was available, valid and was able to initialise the classifier.
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names; empty when the recogniser is
+     * unavailable, {@code text} is {@code null}, or extraction fails
+     */
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> names = new HashMap<>();
+        if (!available || text == null) {
+            return names;
+        }
+
+        try {
+            Class<?> stringVectorClass = Class.forName(STRING_VECTOR_CLASS);
+            Class<?> entityMentionVectorClass = Class.forName(ENTITY_MENTION_VECTOR_CLASS);
+            Class<?> entityMentionClass = Class.forName(ENTITY_MENTION_CLASS);
+            Class<?> globalClass = Class.forName(GLOBAL_CLASS);
+
+            // Tag index -> tag name, as reported by the loaded model.
+            Object tagVector = extractorInstance.getClass()
+                    .getMethod("getPossibleNerTags").invoke(extractorInstance);
+            List<String> possibleTags = toList(stringVectorClass, tagVector);
+
+            // Tokenize the caller's text. tokenize is invoked as a static method,
+            // so no receiver is needed.
+            Method tokenize = globalClass.getMethod("tokenize", String.class);
+            Object tokenVector = tokenize.invoke(null, text);
+            List<String> tokens = toList(stringVectorClass, tokenVector);
+
+            Method extractEntities =
+                    extractorInstance.getClass().getMethod("extractEntities", stringVectorClass);
+            Object entities = extractEntities.invoke(extractorInstance, tokenVector);
+
+            // Resolve the reflective accessors once instead of on every iteration.
+            Method mentionAt = entityMentionVectorClass.getMethod("get", Integer.TYPE);
+            Method getTag = entityMentionClass.getMethod("getTag");
+            Method getStart = entityMentionClass.getMethod("getStart");
+            Method getEnd = entityMentionClass.getMethod("getEnd");
+
+            long size = (Long) entityMentionVectorClass.getMethod("size").invoke(entities);
+            for (int i = 0; i < size; i++) {
+                Object mention = mentionAt.invoke(entities, i);
+                int tagIndex = (Integer) getTag.invoke(mention);
+                String tag = possibleTags.get(tagIndex);
+
+                Set<String> matches = names.get(tag);
+                if (matches == null) {
+                    matches = new HashSet<>();
+                    names.put(tag, matches);
+                }
+
+                // [start, end) is a token range; join the tokens with single spaces
+                // so multi-token names are not fused together.
+                int start = (Integer) getStart.invoke(mention);
+                int end = (Integer) getEnd.invoke(mention);
+                StringBuilder match = new StringBuilder();
+                for (int t = start; t < end; t++) {
+                    match.append(tokens.get(t)).append(' ');
+                }
+                matches.add(match.toString().trim());
+            }
+
+            if (LOG.isDebugEnabled()) {
+                for (Map.Entry<String, Set<String>> entry : names.entrySet()) {
+                    LOG.debug("{}\t{}", entry.getKey(), entry.getValue());
+                }
+            }
+        } catch (Exception e) {
+            LOG.warn("MITIE entity extraction failed", e);
+        }
+        return names;
+    }
+
+    /**
+     * Copies the elements of a MITIE {@code StringVector} into a {@link List},
+     * using its reflective {@code size()} and {@code get(int)} accessors.
+     */
+    private static List<String> toList(Class<?> stringVectorClass, Object stringVector)
+            throws ReflectiveOperationException {
+        Method sizeMethod = stringVectorClass.getMethod("size");
+        Method getMethod = stringVectorClass.getMethod("get", Integer.TYPE);
+        long size = (Long) sizeMethod.invoke(stringVector);
+        List<String> items = new ArrayList<>((int) size);
+        for (int i = 0; i < size; i++) {
+            items.add((String) getMethod.invoke(stringVector, i));
+        }
+        return items;
+    }
+
+}

Reply via email to