Author: joern
Date: Wed May 25 13:44:45 2016
New Revision: 1745485

URL: http://svn.apache.org/viewvc?rev=1745485&view=rev
Log:
OPENNLP-850 Add ner brat annotation service

Added:
    opennlp/sandbox/opennlp-brat-annotator/
    opennlp/sandbox/opennlp-brat-annotator/pom.xml   (with props)
    opennlp/sandbox/opennlp-brat-annotator/src/
    opennlp/sandbox/opennlp-brat-annotator/src/main/
    opennlp/sandbox/opennlp-brat-annotator/src/main/java/
    opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/
    opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/
    opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
    opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
    opennlp/sandbox/opennlp-brat-annotator/src/test/
    opennlp/sandbox/opennlp-brat-annotator/src/test/java/
    opennlp/sandbox/opennlp-brat-annotator/src/test/java/opennlp/
    opennlp/sandbox/opennlp-brat-annotator/src/test/java/opennlp/bratannotator/
Modified:
    opennlp/sandbox/mallet-addon/params/crf-params.txt
    opennlp/sandbox/mallet-addon/params/maxent-params.txt
    opennlp/sandbox/mallet-addon/pom.xml
    opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
    opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java

Modified: opennlp/sandbox/mallet-addon/params/crf-params.txt
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/params/crf-params.txt?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/params/crf-params.txt (original)
+++ opennlp/sandbox/mallet-addon/params/crf-params.txt Wed May 25 13:44:45 2016
@@ -15,6 +15,6 @@
 
 # Sample machine learning properties file
 Algorithm=opennlp.addons.mallet.CRFTrainer
-Cutoff=0
+Cutoff=2
 Iterations=100
 

Modified: opennlp/sandbox/mallet-addon/params/maxent-params.txt
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/params/maxent-params.txt?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- opennlp/sandbox/mallet-addon/params/maxent-params.txt (original)
+++ opennlp/sandbox/mallet-addon/params/maxent-params.txt Wed May 25 13:44:45 2016
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 # Sample machine learning properties file
-Algorithm=opennlp.addons.mallet.MaxentTrainer
+#Algorithm=opennlp.addons.mallet.MaxentTrainer
+Algorithm=PERCEPTRON
 Cutoff=0
 Iterations=100
-
+#BeamSize=5

Modified: opennlp/sandbox/mallet-addon/pom.xml
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/pom.xml?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
Binary files - no diff available.

Modified: 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
 (original)
+++ 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java
 Wed May 25 13:44:45 2016
@@ -127,21 +127,20 @@ public class CRFTrainer extends Abstract
     // CRFOptimizableBy* objects (terms in the objective function)
     // objective 1: label likelihood objective
 
-    CRFTrainerByLabelLikelihood crfTrainer = new CRFTrainerByLabelLikelihood(
-        crf);
-    crfTrainer.setGaussianPriorVariance(1.0);
-
-//    CRFOptimizableByLabelLikelihood optLabel = new
-//        CRFOptimizableByLabelLikelihood(crf, trainingData);
-//
+//    CRFTrainerByLabelLikelihood crfTrainer = new 
CRFTrainerByLabelLikelihood(crf);
+//    crfTrainer.setGaussianPriorVariance(1.0);
+
+    CRFOptimizableByLabelLikelihood optLabel = new
+        CRFOptimizableByLabelLikelihood(crf, trainingData);
+
 //    // CRF trainer
-//     Optimizable.ByGradientValue[] opts = new Optimizable.ByGradientValue[] {
-//     optLabel };
+     Optimizable.ByGradientValue[] opts = new Optimizable.ByGradientValue[] {
+     optLabel };
 
-    // by default, use L-BFGS as the optimizer
-//     CRFTrainerByValueGradients crfTrainer = new CRFTrainerByValueGradients(
-//     crf, opts);
-//     crfTrainer.setMaxResets(0);
+//     by default, use L-BFGS as the optimizer
+     CRFTrainerByValueGradients crfTrainer = new CRFTrainerByValueGradients(
+     crf, opts);
+     crfTrainer.setMaxResets(0);
 
     // SNIP
 

Modified: 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java?rev=1745485&r1=1745484&r2=1745485&view=diff
==============================================================================
--- 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
 (original)
+++ 
opennlp/sandbox/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java
 Wed May 25 13:44:45 2016
@@ -27,8 +27,17 @@ import java.util.Map;
 import opennlp.tools.ml.AbstractEventTrainer;
 import opennlp.tools.ml.model.DataIndexer;
 import opennlp.tools.ml.model.MaxentModel;
+import cc.mallet.classify.C45Trainer;
 import cc.mallet.classify.Classifier;
+import cc.mallet.classify.MaxEntGETrainer;
+import cc.mallet.classify.MaxEntL1Trainer;
+import cc.mallet.classify.MaxEntPRTrainer;
 import cc.mallet.classify.MaxEntTrainer;
+import cc.mallet.classify.NaiveBayes;
+import cc.mallet.classify.NaiveBayesEMTrainer;
+import cc.mallet.classify.NaiveBayesTrainer;
+import cc.mallet.optimize.LimitedMemoryBFGS;
+import cc.mallet.optimize.Optimizer;
 import cc.mallet.types.Alphabet;
 import cc.mallet.types.FeatureVector;
 import cc.mallet.types.Instance;
@@ -67,22 +76,21 @@ public class MaxentTrainer extends Abstr
         weights[featureIndex] = indexer.getNumTimesEventsSeen()[contextIndex];
       }
 
-      FeatureVector fv = new FeatureVector(dataAlphabet, malletFeatures,
-          weights);
+      FeatureVector fv = new FeatureVector(dataAlphabet, malletFeatures, 
weights);
       Instance inst = new Instance(fv, targetAlphabet.lookupLabel(
-          indexer.getOutcomeLabels()[outcomes[contextIndex]], true), "name",
+          indexer.getOutcomeLabels()[outcomes[contextIndex]], true), "fid:" + 
contextIndex,
           "data-indexer");
       instances.add(inst);
     }
 
     InstanceList trainingData = new InstanceList(dataAlphabet, targetAlphabet);
-    Instance inst = instances.iterator().next();
-
-    Alphabet.alphabetsMatch(trainingData, inst);
+    
     trainingData.addAll(instances);
 
     MaxEntTrainer trainer = new MaxEntTrainer();
-    
+//    trainer.setGaussianPriorVariance(1d);
+//    trainer.setNumIterations(100);
+
     Classifier classifier = trainer.train(trainingData);
 
     return new ClassifierModel(classifier);

Added: opennlp/sandbox/opennlp-brat-annotator/pom.xml
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/pom.xml?rev=1745485&view=auto
==============================================================================
Binary file - no diff available.

Propchange: opennlp/sandbox/opennlp-brat-annotator/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = application/xml

Added: 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java?rev=1745485&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
 (added)
+++ 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
 Wed May 25 13:44:45 2016
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratannotator;
+
+import java.io.File;
+import java.net.URI;
+import java.net.URL;
+
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.servlet.ServletHolder;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+
+/**
+ * Command line entry point: loads the OpenNLP components named by the
+ * arguments and serves this package's resources via Jetty on port 8080.
+ */
+public class BratAnnService {
+
+  // Assigned by main(); BratNameFinderResource copies them into its fields.
+  public static SentenceDetector sentenceDetector;
+  public static Tokenizer tokenizer;
+  public static TokenNameFinder nameFinders[];
+
+  /**
+   * Loads the sentence detector, the tokenizer and the name finders, then
+   * starts the server and joins it.
+   *
+   * @param args {@code sentenceDetector:newline} or a sentence model file,
+   *     then {@code tokenizer:whitespace}, {@code tokenizer:simple} or a
+   *     tokenizer model file, followed by one or more name finder model URLs
+   * @throws Exception if a model cannot be loaded or the server fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    if (args.length < 3) {
+      System.out.println("sentenceDetectorURI tokenizerURI namefinderURI_1 ... "
+          + "nameFinderURI_n");
+      return;
+    }
+
+    // The pseudo schemes are matched on the raw argument instead of going
+    // through URI.create(), which throws IllegalArgumentException for plain
+    // model file paths that contain spaces or backslashes.
+    if (args[0].startsWith("sentenceDetector:")) {
+      if (!"sentenceDetector:newline".equals(args[0])) {
+        System.out.println("unknown sentence detector");
+        return;
+      }
+      sentenceDetector = new NewlineSentenceDetector();
+    }
+    else {
+      sentenceDetector = new SentenceDetectorME(
+          new SentenceModel(new File(args[0])));
+    }
+
+    if (args[1].startsWith("tokenizer:")) {
+      if ("tokenizer:whitespace".equals(args[1])) {
+        tokenizer = WhitespaceTokenizer.INSTANCE;
+      }
+      else if ("tokenizer:simple".equals(args[1])) {
+        tokenizer = SimpleTokenizer.INSTANCE;
+      }
+      else {
+        // Previously this branch reported a sentence detector problem.
+        System.out.println("unknown tokenizer");
+        return;
+      }
+    }
+    else {
+      tokenizer = new TokenizerME(new TokenizerModel(new File(args[1])));
+    }
+
+    // Every remaining argument is a name finder model, as the usage line
+    // promises; previously only args[2] was loaded.
+    nameFinders = new TokenNameFinder[args.length - 2];
+    for (int i = 2; i < args.length; i++) {
+      nameFinders[i - 2] = new NameFinderME(
+          new TokenNameFinderModel(new URL(args[i])));
+    }
+
+    ServletContextHandler context = new ServletContextHandler(
+        ServletContextHandler.SESSIONS);
+    context.setContextPath("/");
+
+    Server jettyServer = new Server(8080);
+    jettyServer.setHandler(context);
+
+    ServletHolder jerseyServlet = context.addServlet(
+        com.sun.jersey.spi.container.servlet.ServletContainer.class, "/*");
+    jerseyServlet.setInitParameter(
+        "com.sun.jersey.config.property.packages", "opennlp.bratannotator");
+    jerseyServlet.setInitParameter(
+        "com.sun.jersey.api.json.POJOMappingFeature", "true");
+    jerseyServlet.setInitOrder(0);
+
+    jerseyServlet.setInitParameter("jersey.config.server.provider.classnames",
+        BratNameFinderResource.class.getCanonicalName());
+
+    try {
+      jettyServer.start();
+      jettyServer.join();
+    } finally {
+      jettyServer.destroy();
+    }
+  }
+}

Added: 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
URL: 
http://svn.apache.org/viewvc/opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java?rev=1745485&view=auto
==============================================================================
--- 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
 (added)
+++ 
opennlp/sandbox/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
 Wed May 25 13:44:45 2016
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratannotator;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * REST resource at "/ner" that runs the configured name finders over plain
+ * text and returns the detected names as JSON.
+ */
+@Path("/ner")
+public class BratNameFinderResource {
+
+  /**
+   * One detected name. offsets[k] holds the begin and end character offset
+   * of texts[k]; a name that spans line breaks has one entry per line.
+   */
+  public static class NameAnn {
+    public int[][] offsets;
+    public String[] texts;
+    public String type;
+  }
+
+  // NOTE(review): these instances come from static fields of BratAnnService
+  // and are therefore shared by every resource instance - confirm that they
+  // are safe to use from concurrent requests.
+  private SentenceDetector sentDetect = BratAnnService.sentenceDetector;
+  private Tokenizer tokenizer = BratAnnService.tokenizer;
+  private TokenNameFinder nameFinders[] = BratAnnService.nameFinders;
+
+  /**
+   * Returns the index of the first character in [beginOffset, endOffset)
+   * that is not whitespace, or -1 if there is none.
+   */
+  private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset,
+      int endOffset) {
+
+    for (int i = beginOffset; i < endOffset; i++) {
+      char c = s.charAt(i);
+
+      // isSpaceChar() alone is false for '\n', '\r' and '\t' (they are
+      // control characters), isWhitespace() alone is false for the no-break
+      // space, so both checks are needed.
+      if (!Character.isWhitespace(c) && !Character.isSpaceChar(c)) {
+        return i;
+      }
+    }
+
+    return -1;
+  }
+
+  /**
+   * Builds the annotation for the text range [beginOffset, endOffset) and
+   * splits it at line breaks, so that no segment contains a line break.
+   */
+  private static NameAnn createAnnotation(String text, int beginOffset,
+      int endOffset, String type) {
+
+    List<String> textSegments = new ArrayList<String>();
+    List<int[]> spanSegments = new ArrayList<int[]>();
+
+    int segmentBegin = beginOffset;
+
+    for (int ci = beginOffset; ci < endOffset; ci++) {
+      char c = text.charAt(ci);
+
+      // Line breaks in front of segmentBegin were already skipped as
+      // whitespace; this covers "\r\n" and several new lines in a row.
+      if ((c != '\n' && c != '\r') || ci < segmentBegin) {
+        continue;
+      }
+
+      // close the segment that ends at this line break
+      textSegments.add(text.substring(segmentBegin, ci));
+      spanSegments.add(new int[] { segmentBegin, ci });
+
+      segmentBegin = findNextNonWhitespaceChar(text, ci + 1, endOffset);
+
+      if (segmentBegin == -1) {
+        break;
+      }
+    }
+
+    // create left over segment
+    if (segmentBegin != -1) {
+      textSegments.add(text.substring(segmentBegin, endOffset));
+      spanSegments.add(new int[] { segmentBegin, endOffset });
+    }
+
+    NameAnn ann = new NameAnn();
+    ann.texts = textSegments.toArray(new String[textSegments.size()]);
+    ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
+    ann.type = type;
+
+    return ann;
+  }
+
+  /**
+   * Detects names in the posted text.
+   *
+   * @param modelName value of the "model" query parameter; currently unused
+   * @param text the request body to analyze
+   * @return the annotations, keyed by a running index starting at "0"
+   */
+  @POST
+  @Consumes(MediaType.TEXT_PLAIN)
+  @Produces(MediaType.APPLICATION_JSON)
+  public Map<String, NameAnn> findNames(@QueryParam("model") String modelName,
+      String text) {
+
+    Span sentenceSpans[] = sentDetect.sentPosDetect(text);
+
+    Map<String, NameAnn> map = new HashMap<String, NameAnn>();
+
+    int indexCounter = 0;
+
+    for (Span sentenceSpan : sentenceSpans) {
+
+      String sentenceText = sentenceSpan.getCoveredText(text).toString();
+
+      // Token spans are relative to the sentence; the sentence start is
+      // added back below to obtain offsets into the whole text.
+      Span tokenSpans[] = tokenizer.tokenizePos(sentenceText);
+
+      String tokens[] = Span.spansToStrings(tokenSpans, sentenceText);
+
+      for (TokenNameFinder nameFinder : nameFinders) {
+        for (Span name : nameFinder.find(tokens)) {
+
+          int beginOffset = tokenSpans[name.getStart()].getStart()
+              + sentenceSpan.getStart();
+          int endOffset = tokenSpans[name.getEnd() - 1].getEnd()
+              + sentenceSpan.getStart();
+
+          map.put(Integer.toString(indexCounter++),
+              createAnnotation(text, beginOffset, endOffset, name.getType()));
+        }
+      }
+    }
+
+    // The posted text is treated as one document: drop what the name finders
+    // adapted to while processing it, so it cannot influence later requests.
+    for (TokenNameFinder nameFinder : nameFinders) {
+      nameFinder.clearAdaptiveData();
+    }
+
+    return map;
+  }
+}


Reply via email to